net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /* Instead of increasing this, you should create a hash table. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 175
 176 static DEFINE_SPINLOCK(ptype_lock);
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178 static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * See, for example usages, register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline void dev_base_seq_inc(struct net *net)
 203 {
 204         while (++net->dev_base_seq == 0);
 205 }
 206
 207 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208 {
 209         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211 }
 212
 213 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214 {
 215         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216 }
 217
 218 static inline void rps_lock(struct softnet_data *sd)
 219 {
 220 #ifdef CONFIG_RPS
 221         spin_lock(&sd->input_pkt_queue.lock);
 222 #endif
 223 }
 224
 225 static inline void rps_unlock(struct softnet_data *sd)
 226 {
 227 #ifdef CONFIG_RPS
 228         spin_unlock(&sd->input_pkt_queue.lock);
 229 #endif
 230 }
 231
 232 /* Device list insertion */
 233 static int list_netdevice(struct net_device *dev)
 234 {
 235         struct net *net = dev_net(dev);
 236
 237         ASSERT_RTNL();
 238
 239         write_lock_bh(&dev_base_lock);
 240         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 241         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 242         hlist_add_head_rcu(&dev->index_hlist,
 243                            dev_index_hash(net, dev->ifindex));
 244         write_unlock_bh(&dev_base_lock);
 245
 246         dev_base_seq_inc(net);
 247
 248         return 0;
 249 }
 250
 251 /* Device list removal
 252  * caller must respect a RCU grace period before freeing/reusing dev
 253  */
 254 static void unlist_netdevice(struct net_device *dev)
 255 {
 256         ASSERT_RTNL();
 257
 258         /* Unlink dev from the device chain */
 259         write_lock_bh(&dev_base_lock);
 260         list_del_rcu(&dev->dev_list);
 261         hlist_del_rcu(&dev->name_hlist);
 262         hlist_del_rcu(&dev->index_hlist);
 263         write_unlock_bh(&dev_base_lock);
 264
 265         dev_base_seq_inc(dev_net(dev));
 266 }
 267
 268 /*
 269  *      Our notifier list
 270  */
 271
 272 static RAW_NOTIFIER_HEAD(netdev_chain);
 273
 274 /*
 275  *      Device drivers call our routines to queue packets here. We empty the
 276  *      queue in the local softnet handler.
 277  */
 278
 279 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 280 EXPORT_PER_CPU_SYMBOL(softnet_data);
 281
 282 #ifdef CONFIG_LOCKDEP
 283 /*
 284  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 285  * according to dev->type
 286  */
 287 static const unsigned short netdev_lock_type[] =
 288         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 289          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 290          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 291          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 292          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 293          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 294          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 295          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 296          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 297          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 298          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 299          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 300          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 301          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 302          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 303          ARPHRD_VOID, ARPHRD_NONE};
 304
 305 static const char *const netdev_lock_name[] =
 306         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 307          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 308          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 309          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 310          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 311          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 312          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 313          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 314          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 315          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 316          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 317          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 318          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 319          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 320          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 321          "_xmit_VOID", "_xmit_NONE"};
 322
 323 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 324 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 325
 326 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 327 {
 328         int i;
 329
 330         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 331                 if (netdev_lock_type[i] == dev_type)
 332                         return i;
 333         /* the last key is used by default */
 334         return ARRAY_SIZE(netdev_lock_type) - 1;
 335 }
 336
 337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                  unsigned short dev_type)
 339 {
 340         int i;
 341
 342         i = netdev_lock_pos(dev_type);
 343         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346
 347 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 348 {
 349         int i;
 350
 351         i = netdev_lock_pos(dev->type);
 352         lockdep_set_class_and_name(&dev->addr_list_lock,
 353                                    &netdev_addr_lock_key[i],
 354                                    netdev_lock_name[i]);
 355 }
 356 #else
 357 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 358                                                  unsigned short dev_type)
 359 {
 360 }
 361 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 362 {
 363 }
 364 #endif
 365
 366 /*******************************************************************************
 367
 368                 Protocol management and registration routines
 369
 370 *******************************************************************************/
 371
 372 /*
 373  *      Add a protocol ID to the list. Now that the input handler is
 374  *      smarter we can dispense with all the messy stuff that used to be
 375  *      here.
 376  *
 377  *      BEWARE!!! Protocol handlers, mangling input packets,
 378  *      MUST BE last in hash buckets and checking protocol handlers
 379  *      MUST start from promiscuous ptype_all chain in net_bh.
 380  *      It is true now, do not change it.
 381  *      Explanation follows: if protocol handler, mangling packet, will
 382  *      be the first on list, it is not able to sense, that packet
 383  *      is cloned and should be copied-on-write, so that it will
 384  *      change it and subsequent readers will get broken packet.
 385  *                                                      --ANK (980803)
 386  */
 387
 388 static inline struct list_head *ptype_head(const struct packet_type *pt)
 389 {
 390         if (pt->type == htons(ETH_P_ALL))
 391                 return &ptype_all;
 392         else
 393                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 394 }
 395
 396 /**
 397  *      dev_add_pack - add packet handler
 398  *      @pt: packet type declaration
 399  *
 400  *      Add a protocol handler to the networking stack. The passed &packet_type
 401  *      is linked into kernel lists and may not be freed until it has been
 402  *      removed from the kernel lists.
 403  *
 404  *      This call does not sleep therefore it can not
 405  *      guarantee all CPU's that are in middle of receiving packets
 406  *      will see the new packet type (until the next received packet).
 407  */
 408
 409 void dev_add_pack(struct packet_type *pt)
 410 {
 411         struct list_head *head = ptype_head(pt);
 412
 413         spin_lock(&ptype_lock);
 414         list_add_rcu(&pt->list, head);
 415         spin_unlock(&ptype_lock);
 416 }
 417 EXPORT_SYMBOL(dev_add_pack);
 418
 419 /**
 420  *      __dev_remove_pack        - remove packet handler
 421  *      @pt: packet type declaration
 422  *
 423  *      Remove a protocol handler that was previously added to the kernel
 424  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 425  *      from the kernel lists and can be freed or reused once this function
 426  *      returns.
 427  *
 428  *      The packet type might still be in use by receivers
 429  *      and must not be freed until after all the CPU's have gone
 430  *      through a quiescent state.
 431  */
 432 void __dev_remove_pack(struct packet_type *pt)
 433 {
 434         struct list_head *head = ptype_head(pt);
 435         struct packet_type *pt1;
 436
 437         spin_lock(&ptype_lock);
 438
 439         list_for_each_entry(pt1, head, list) {
 440                 if (pt == pt1) {
 441                         list_del_rcu(&pt->list);
 442                         goto out;
 443                 }
 444         }
 445
 446         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 447 out:
 448         spin_unlock(&ptype_lock);
 449 }
 450 EXPORT_SYMBOL(__dev_remove_pack);
 451
 452 /**
 453  *      dev_remove_pack  - remove packet handler
 454  *      @pt: packet type declaration
 455  *
 456  *      Remove a protocol handler that was previously added to the kernel
 457  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 458  *      from the kernel lists and can be freed or reused once this function
 459  *      returns.
 460  *
 461  *      This call sleeps to guarantee that no CPU is looking at the packet
 462  *      type after return.
 463  */
 464 void dev_remove_pack(struct packet_type *pt)
 465 {
 466         __dev_remove_pack(pt);
 467
 468         synchronize_net();
 469 }
 470 EXPORT_SYMBOL(dev_remove_pack);
 471
 472 /******************************************************************************
 473
 474                       Device Boot-time Settings Routines
 475
 476 *******************************************************************************/
 477
 478 /* Boot time configuration table */
 479 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 480
 481 /**
 482  *      netdev_boot_setup_add   - add new setup entry
 483  *      @name: name of the device
 484  *      @map: configured settings for the device
 485  *
 486  *      Adds new setup entry to the dev_boot_setup list.  The function
 487  *      returns 0 on error and 1 on success.  This is a generic routine to
 488  *      all netdevices.
 489  */
 490 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 491 {
 492         struct netdev_boot_setup *s;
 493         int i;
 494
 495         s = dev_boot_setup;
 496         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 497                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 498                         memset(s[i].name, 0, sizeof(s[i].name));
 499                         strlcpy(s[i].name, name, IFNAMSIZ);
 500                         memcpy(&s[i].map, map, sizeof(s[i].map));
 501                         break;
 502                 }
 503         }
 504
 505         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 506 }
 507
 508 /**
 509  *      netdev_boot_setup_check - check boot time settings
 510  *      @dev: the netdevice
 511  *
 512  *      Check boot time settings for the device.
 513  *      The found settings are set for the device to be used
 514  *      later in the device probing.
 515  *      Returns 0 if no settings found, 1 if they are.
 516  */
 517 int netdev_boot_setup_check(struct net_device *dev)
 518 {
 519         struct netdev_boot_setup *s = dev_boot_setup;
 520         int i;
 521
 522         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 523                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 524                     !strcmp(dev->name, s[i].name)) {
 525                         dev->irq        = s[i].map.irq;
 526                         dev->base_addr  = s[i].map.base_addr;
 527                         dev->mem_start  = s[i].map.mem_start;
 528                         dev->mem_end    = s[i].map.mem_end;
 529                         return 1;
 530                 }
 531         }
 532         return 0;
 533 }
 534 EXPORT_SYMBOL(netdev_boot_setup_check);
 535
 536
 537 /**
 538  *      netdev_boot_base        - get address from boot time settings
 539  *      @prefix: prefix for network device
 540  *      @unit: id for network device
 541  *
 542  *      Check boot time settings for the base address of device.
 543  *      The found settings are set for the device to be used
 544  *      later in the device probing.
 545  *      Returns 0 if no settings found.
 546  */
 547 unsigned long netdev_boot_base(const char *prefix, int unit)
 548 {
 549         const struct netdev_boot_setup *s = dev_boot_setup;
 550         char name[IFNAMSIZ];
 551         int i;
 552
 553         sprintf(name, "%s%d", prefix, unit);
 554
 555         /*
 556          * If device already registered then return base of 1
 557          * to indicate not to probe for this interface
 558          */
 559         if (__dev_get_by_name(&init_net, name))
 560                 return 1;
 561
 562         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 563                 if (!strcmp(name, s[i].name))
 564                         return s[i].map.base_addr;
 565         return 0;
 566 }
 567
 568 /*
 569  * Saves at boot time configured settings for any netdevice.
 570  */
 571 int __init netdev_boot_setup(char *str)
 572 {
 573         int ints[5];
 574         struct ifmap map;
 575
 576         str = get_options(str, ARRAY_SIZE(ints), ints);
 577         if (!str || !*str)
 578                 return 0;
 579
 580         /* Save settings */
 581         memset(&map, 0, sizeof(map));
 582         if (ints[0] > 0)
 583                 map.irq = ints[1];
 584         if (ints[0] > 1)
 585                 map.base_addr = ints[2];
 586         if (ints[0] > 2)
 587                 map.mem_start = ints[3];
 588         if (ints[0] > 3)
 589                 map.mem_end = ints[4];
 590
 591         /* Add new entry to the list */
 592         return netdev_boot_setup_add(str, &map);
 593 }
 594
 595 __setup("netdev=", netdev_boot_setup);
 596
 597 /*******************************************************************************
 598
 599                             Device Interface Subroutines
 600
 601 *******************************************************************************/
 602
 603 /**
 604  *      __dev_get_by_name       - find a device by its name
 605  *      @net: the applicable net namespace
 606  *      @name: name to find
 607  *
 608  *      Find an interface by name. Must be called under RTNL semaphore
 609  *      or @dev_base_lock. If the name is found a pointer to the device
 610  *      is returned. If the name is not found then %NULL is returned. The
 611  *      reference counters are not incremented so the caller must be
 612  *      careful with locks.
 613  */
 614
 615 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 616 {
 617         struct hlist_node *p;
 618         struct net_device *dev;
 619         struct hlist_head *head = dev_name_hash(net, name);
 620
 621         hlist_for_each_entry(dev, p, head, name_hlist)
 622                 if (!strncmp(dev->name, name, IFNAMSIZ))
 623                         return dev;
 624
 625         return NULL;
 626 }
 627 EXPORT_SYMBOL(__dev_get_by_name);
 628
 629 /**
 630  *      dev_get_by_name_rcu     - find a device by its name
 631  *      @net: the applicable net namespace
 632  *      @name: name to find
 633  *
 634  *      Find an interface by name.
 635  *      If the name is found a pointer to the device is returned.
 636  *      If the name is not found then %NULL is returned.
 637  *      The reference counters are not incremented so the caller must be
 638  *      careful with locks. The caller must hold RCU lock.
 639  */
 640
 641 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 642 {
 643         struct hlist_node *p;
 644         struct net_device *dev;
 645         struct hlist_head *head = dev_name_hash(net, name);
 646
 647         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 648                 if (!strncmp(dev->name, name, IFNAMSIZ))
 649                         return dev;
 650
 651         return NULL;
 652 }
 653 EXPORT_SYMBOL(dev_get_by_name_rcu);
 654
 655 /**
 656  *      dev_get_by_name         - find a device by its name
 657  *      @net: the applicable net namespace
 658  *      @name: name to find
 659  *
 660  *      Find an interface by name. This can be called from any
 661  *      context and does its own locking. The returned handle has
 662  *      the usage count incremented and the caller must use dev_put() to
 663  *      release it when it is no longer needed. %NULL is returned if no
 664  *      matching device is found.
 665  */
 666
 667 struct net_device *dev_get_by_name(struct net *net, const char *name)
 668 {
 669         struct net_device *dev;
 670
 671         rcu_read_lock();
 672         dev = dev_get_by_name_rcu(net, name);
 673         if (dev)
 674                 dev_hold(dev);
 675         rcu_read_unlock();
 676         return dev;
 677 }
 678 EXPORT_SYMBOL(dev_get_by_name);
 679
 680 /**
 681  *      __dev_get_by_index - find a device by its ifindex
 682  *      @net: the applicable net namespace
 683  *      @ifindex: index of device
 684  *
 685  *      Search for an interface by index. Returns %NULL if the device
 686  *      is not found or a pointer to the device. The device has not
 687  *      had its reference counter increased so the caller must be careful
 688  *      about locking. The caller must hold either the RTNL semaphore
 689  *      or @dev_base_lock.
 690  */
 691
 692 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 693 {
 694         struct hlist_node *p;
 695         struct net_device *dev;
 696         struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698         hlist_for_each_entry(dev, p, head, index_hlist)
 699                 if (dev->ifindex == ifindex)
 700                         return dev;
 701
 702         return NULL;
 703 }
 704 EXPORT_SYMBOL(__dev_get_by_index);
 705
 706 /**
 707  *      dev_get_by_index_rcu - find a device by its ifindex
 708  *      @net: the applicable net namespace
 709  *      @ifindex: index of device
 710  *
 711  *      Search for an interface by index. Returns %NULL if the device
 712  *      is not found or a pointer to the device. The device has not
 713  *      had its reference counter increased so the caller must be careful
 714  *      about locking. The caller must hold RCU lock.
 715  */
 716
 717 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 718 {
 719         struct hlist_node *p;
 720         struct net_device *dev;
 721         struct hlist_head *head = dev_index_hash(net, ifindex);
 722
 723         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 724                 if (dev->ifindex == ifindex)
 725                         return dev;
 726
 727         return NULL;
 728 }
 729 EXPORT_SYMBOL(dev_get_by_index_rcu);
 730
 731
 732 /**
 733  *      dev_get_by_index - find a device by its ifindex
 734  *      @net: the applicable net namespace
 735  *      @ifindex: index of device
 736  *
 737  *      Search for an interface by index. Returns NULL if the device
 738  *      is not found or a pointer to the device. The device returned has
 739  *      had a reference added and the pointer is safe until the user calls
 740  *      dev_put to indicate they have finished with it.
 741  */
 742
 743 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 744 {
 745         struct net_device *dev;
 746
 747         rcu_read_lock();
 748         dev = dev_get_by_index_rcu(net, ifindex);
 749         if (dev)
 750                 dev_hold(dev);
 751         rcu_read_unlock();
 752         return dev;
 753 }
 754 EXPORT_SYMBOL(dev_get_by_index);
 755
 756 /**
 757  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 758  *      @net: the applicable net namespace
 759  *      @type: media type of device
 760  *      @ha: hardware address
 761  *
 762  *      Search for an interface by MAC address. Returns NULL if the device
 763  *      is not found or a pointer to the device.
 764  *      The caller must hold RCU or RTNL.
 765  *      The returned device has not had its ref count increased
 766  *      and the caller must therefore be careful about locking
 767  *
 768  */
 769
 770 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 771                                        const char *ha)
 772 {
 773         struct net_device *dev;
 774
 775         for_each_netdev_rcu(net, dev)
 776                 if (dev->type == type &&
 777                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 778                         return dev;
 779
 780         return NULL;
 781 }
 782 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 783
 784 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 785 {
 786         struct net_device *dev;
 787
 788         ASSERT_RTNL();
 789         for_each_netdev(net, dev)
 790                 if (dev->type == type)
 791                         return dev;
 792
 793         return NULL;
 794 }
 795 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 796
 797 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 798 {
 799         struct net_device *dev, *ret = NULL;
 800
 801         rcu_read_lock();
 802         for_each_netdev_rcu(net, dev)
 803                 if (dev->type == type) {
 804                         dev_hold(dev);
 805                         ret = dev;
 806                         break;
 807                 }
 808         rcu_read_unlock();
 809         return ret;
 810 }
 811 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 812
 813 /**
 814  *      dev_get_by_flags_rcu - find any device with given flags
 815  *      @net: the applicable net namespace
 816  *      @if_flags: IFF_* values
 817  *      @mask: bitmask of bits in if_flags to check
 818  *
 819  *      Search for any interface with the given flags. Returns NULL if a device
 820  *      is not found or a pointer to the device. Must be called inside
 821  *      rcu_read_lock(), and result refcount is unchanged.
 822  */
 823
 824 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 825                                     unsigned short mask)
 826 {
 827         struct net_device *dev, *ret;
 828
 829         ret = NULL;
 830         for_each_netdev_rcu(net, dev) {
 831                 if (((dev->flags ^ if_flags) & mask) == 0) {
 832                         ret = dev;
 833                         break;
 834                 }
 835         }
 836         return ret;
 837 }
 838 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 839
 840 /**
 841  *      dev_valid_name - check if name is okay for network device
 842  *      @name: name string
 843  *
 844  *      Network device names need to be valid file names to
 845  *      to allow sysfs to work.  We also disallow any kind of
 846  *      whitespace.
 847  */
 848 int dev_valid_name(const char *name)
 849 {
 850         if (*name == '\0')
 851                 return 0;
 852         if (strlen(name) >= IFNAMSIZ)
 853                 return 0;
 854         if (!strcmp(name, ".") || !strcmp(name, ".."))
 855                 return 0;
 856
 857         while (*name) {
 858                 if (*name == '/' || isspace(*name))
 859                         return 0;
 860                 name++;
 861         }
 862         return 1;
 863 }
 864 EXPORT_SYMBOL(dev_valid_name);
 865
 866 /**
 867  *      __dev_alloc_name - allocate a name for a device
 868  *      @net: network namespace to allocate the device name in
 869  *      @name: name format string
 870  *      @buf:  scratch buffer and result name string
 871  *
 872  *      Passed a format string - eg "lt%d" it will try and find a suitable
 873  *      id. It scans list of devices to build up a free map, then chooses
 874  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 875  *      while allocating the name and adding the device in order to avoid
 876  *      duplicates.
 877  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 878  *      Returns the number of the unit assigned or a negative errno code.
 879  */
 880
 881 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 882 {
 883         int i = 0;
 884         const char *p;
 885         const int max_netdevices = 8*PAGE_SIZE;
 886         unsigned long *inuse;
 887         struct net_device *d;
 888
 889         p = strnchr(name, IFNAMSIZ-1, '%');
 890         if (p) {
 891                 /*
 892                  * Verify the string as this thing may have come from
 893                  * the user.  There must be either one "%d" and no other "%"
 894                  * characters.
 895                  */
 896                 if (p[1] != 'd' || strchr(p + 2, '%'))
 897                         return -EINVAL;
 898
 899                 /* Use one page as a bit array of possible slots */
 900                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 901                 if (!inuse)
 902                         return -ENOMEM;
 903
 904                 for_each_netdev(net, d) {
 905                         if (!sscanf(d->name, name, &i))
 906                                 continue;
 907                         if (i < 0 || i >= max_netdevices)
 908                                 continue;
 909
 910                         /*  avoid cases where sscanf is not exact inverse of printf */
 911                         snprintf(buf, IFNAMSIZ, name, i);
 912                         if (!strncmp(buf, d->name, IFNAMSIZ))
 913                                 set_bit(i, inuse);
 914                 }
 915
 916                 i = find_first_zero_bit(inuse, max_netdevices);
 917                 free_page((unsigned long) inuse);
 918         }
 919
 920         if (buf != name)
 921                 snprintf(buf, IFNAMSIZ, name, i);
 922         if (!__dev_get_by_name(net, buf))
 923                 return i;
 924
 925         /* It is possible to run out of possible slots
 926          * when the name is long and there isn't enough space left
 927          * for the digits, or if all bits are used.
 928          */
 929         return -ENFILE;
 930 }
 931
 932 /**
 933  *      dev_alloc_name - allocate a name for a device
 934  *      @dev: device
 935  *      @name: name format string
 936  *
 937  *      Passed a format string - eg "lt%d" it will try and find a suitable
 938  *      id. It scans list of devices to build up a free map, then chooses
 939  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 940  *      while allocating the name and adding the device in order to avoid
 941  *      duplicates.
 942  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 943  *      Returns the number of the unit assigned or a negative errno code.
 944  */
 945
 946 int dev_alloc_name(struct net_device *dev, const char *name)
 947 {
 948         char buf[IFNAMSIZ];
 949         struct net *net;
 950         int ret;
 951
 952         BUG_ON(!dev_net(dev));
 953         net = dev_net(dev);
 954         ret = __dev_alloc_name(net, name, buf);
 955         if (ret >= 0)
 956                 strlcpy(dev->name, buf, IFNAMSIZ);
 957         return ret;
 958 }
 959 EXPORT_SYMBOL(dev_alloc_name);
 960
 961 static int dev_get_valid_name(struct net_device *dev, const char *name)
 962 {
 963         struct net *net;
 964
 965         BUG_ON(!dev_net(dev));
 966         net = dev_net(dev);
 967
 968         if (!dev_valid_name(name))
 969                 return -EINVAL;
 970
 971         if (strchr(name, '%'))
 972                 return dev_alloc_name(dev, name);
 973         else if (__dev_get_by_name(net, name))
 974                 return -EEXIST;
 975         else if (dev->name != name)
 976                 strlcpy(dev->name, name, IFNAMSIZ);
 977
 978         return 0;
 979 }
 980
 981 /**
 982  *      dev_change_name - change name of a device
 983  *      @dev: device
 984  *      @newname: name (or format string) must be at least IFNAMSIZ
 985  *
 986  *      Change name of a device, can pass format strings "eth%d".
 987  *      for wildcarding.
 988  */
 989 int dev_change_name(struct net_device *dev, const char *newname)
 990 {
 991         char oldname[IFNAMSIZ];
 992         int err = 0;
 993         int ret;
 994         struct net *net;
 995
 996         ASSERT_RTNL();
 997         BUG_ON(!dev_net(dev));
 998
 999         net = dev_net(dev);
1000         if (dev->flags & IFF_UP)
1001                 return -EBUSY;
1002
1003         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004                 return 0;
1005
1006         memcpy(oldname, dev->name, IFNAMSIZ);
1007
1008         err = dev_get_valid_name(dev, newname);
1009         if (err < 0)
1010                 return err;
1011
1012 rollback:
1013         ret = device_rename(&dev->dev, dev->name);
1014         if (ret) {
1015                 memcpy(dev->name, oldname, IFNAMSIZ);
1016                 return ret;
1017         }
1018
1019         write_lock_bh(&dev_base_lock);
1020         hlist_del_rcu(&dev->name_hlist);
1021         write_unlock_bh(&dev_base_lock);
1022
1023         synchronize_rcu();
1024
1025         write_lock_bh(&dev_base_lock);
1026         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027         write_unlock_bh(&dev_base_lock);
1028
1029         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030         ret = notifier_to_errno(ret);
1031
1032         if (ret) {
1033                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1034                 if (err >= 0) {
1035                         err = ret;
1036                         memcpy(dev->name, oldname, IFNAMSIZ);
1037                         goto rollback;
1038                 } else {
1039                         printk(KERN_ERR
1040                                "%s: name change rollback failed: %d.\n",
1041                                dev->name, ret);
1042                 }
1043         }
1044
1045         return err;
1046 }
1047
1048 /**
1049  *      dev_set_alias - change ifalias of a device
1050  *      @dev: device
1051  *      @alias: name up to IFALIASZ
1052  *      @len: limit of bytes to copy from info
1053  *
1054  *      Set ifalias for a device,
1055  */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058         ASSERT_RTNL();
1059
1060         if (len >= IFALIASZ)
1061                 return -EINVAL;
1062
1063         if (!len) {
1064                 if (dev->ifalias) {
1065                         kfree(dev->ifalias);
1066                         dev->ifalias = NULL;
1067                 }
1068                 return 0;
1069         }
1070
1071         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072         if (!dev->ifalias)
1073                 return -ENOMEM;
1074
1075         strlcpy(dev->ifalias, alias, len+1);
1076         return len;
1077 }
1078
1079
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features. Runs the
 *      netdevice notifier chain with %NETDEV_FEAT_CHANGE for @dev; the
 *      chain's return value is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
1090 EXPORT_SYMBOL(netdev_features_change);
1091
1092 /**
1093  *      netdev_state_change - device changes state
1094  *      @dev: device to cause notification
1095  *
1096  *      Called to indicate a device has changed state. This function calls
1097  *      the notifier chains for netdev_chain and sends a NEWLINK message
1098  *      to the routing socket.
1099  */
1100 void netdev_state_change(struct net_device *dev)
1101 {
1102         if (dev->flags & IFF_UP) {
1103                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105         }
1106 }
1107 EXPORT_SYMBOL(netdev_state_change);
1108
/*
 * Run the netdevice notifier chain for @dev with the caller-supplied
 * @event and return the chain's result unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        return call_netdevice_notifiers(event, dev);
}
1113 EXPORT_SYMBOL(netdev_bonding_change);
1114
1115 /**
1116  *      dev_load        - load a network module
1117  *      @net: the applicable net namespace
1118  *      @name: name of interface
1119  *
1120  *      If a network interface is not present and the process has suitable
1121  *      privileges this function loads the module. If module loading is not
1122  *      available in this kernel then it becomes a nop.
1123  */
1124
1125 void dev_load(struct net *net, const char *name)
1126 {
1127         struct net_device *dev;
1128         int no_module;
1129
1130         rcu_read_lock();
1131         dev = dev_get_by_name_rcu(net, name);
1132         rcu_read_unlock();
1133
1134         no_module = !dev;
1135         if (no_module && capable(CAP_NET_ADMIN))
1136                 no_module = request_module("netdev-%s", name);
1137         if (no_module && capable(CAP_SYS_MODULE)) {
1138                 if (!request_module("%s", name))
1139                         pr_err("Loading kernel module for a network device "
1140 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141 "instead\n", name);
1142         }
1143 }
1144 EXPORT_SYMBOL(dev_load);
1145
1146 static int __dev_open(struct net_device *dev)
1147 {
1148         const struct net_device_ops *ops = dev->netdev_ops;
1149         int ret;
1150
1151         ASSERT_RTNL();
1152
1153         if (!netif_device_present(dev))
1154                 return -ENODEV;
1155
1156         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157         ret = notifier_to_errno(ret);
1158         if (ret)
1159                 return ret;
1160
1161         set_bit(__LINK_STATE_START, &dev->state);
1162
1163         if (ops->ndo_validate_addr)
1164                 ret = ops->ndo_validate_addr(dev);
1165
1166         if (!ret && ops->ndo_open)
1167                 ret = ops->ndo_open(dev);
1168
1169         if (ret)
1170                 clear_bit(__LINK_STATE_START, &dev->state);
1171         else {
1172                 dev->flags |= IFF_UP;
1173                 net_dmaengine_get();
1174                 dev_set_rx_mode(dev);
1175                 dev_activate(dev);
1176         }
1177
1178         return ret;
1179 }
1180
1181 /**
1182  *      dev_open        - prepare an interface for use.
1183  *      @dev:   device to open
1184  *
1185  *      Takes a device from down to up state. The device's private open
1186  *      function is invoked and then the multicast lists are loaded. Finally
1187  *      the device is moved into the up state and a %NETDEV_UP message is
1188  *      sent to the netdev notifier chain.
1189  *
1190  *      Calling this function on an active interface is a nop. On a failure
1191  *      a negative errno code is returned.
1192  */
1193 int dev_open(struct net_device *dev)
1194 {
1195         int ret;
1196
1197         if (dev->flags & IFF_UP)
1198                 return 0;
1199
1200         ret = __dev_open(dev);
1201         if (ret < 0)
1202                 return ret;
1203
1204         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205         call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207         return ret;
1208 }
1209 EXPORT_SYMBOL(dev_open);
1210
/*
 * Take every device linked on @head (through unreg_list) down, without
 * sending the NEWLINK message or NETDEV_DOWN event.
 *
 * Asserts that RTNL is held and may sleep. Works in three passes: first
 * each device gets NETDEV_GOING_DOWN and has __LINK_STATE_START cleared,
 * then all devices are deactivated together, and finally each driver's
 * ndo_stop hook (if any) is called and IFF_UP is cleared. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, unreg_list) {
                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_clear_bit(); /* Commit netif_running(). */
        }

        /* Deactivate all devices on the list in one call. */
        dev_deactivate_many(head);

        list_for_each_entry(dev, head, unreg_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                /* Balances the net_dmaengine_get() done in __dev_open(). */
                net_dmaengine_put();
        }

        return 0;
}
1253
/*
 * Single-device wrapper around __dev_close_many(): link @dev onto a
 * temporary on-stack list, close that list, and return the result.
 */
static int __dev_close(struct net_device *dev)
{
        int retval;
        LIST_HEAD(single);

        list_add(&dev->unreg_list, &single);
        retval = __dev_close_many(&single);
        /*
         * Unlink the on-stack head itself.
         * NOTE(review): presumably so dev->unreg_list is not left pointing
         * into this stack frame after return - confirm.
         */
        list_del(&single);
        return retval;
}
1264
1265 static int dev_close_many(struct list_head *head)
1266 {
1267         struct net_device *dev, *tmp;
1268         LIST_HEAD(tmp_list);
1269
1270         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271                 if (!(dev->flags & IFF_UP))
1272                         list_move(&dev->unreg_list, &tmp_list);
1273
1274         __dev_close_many(head);
1275
1276         list_for_each_entry(dev, head, unreg_list) {
1277                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1279         }
1280
1281         /* rollback_registered_many needs the complete original list */
1282         list_splice(&tmp_list, head);
1283         return 0;
1284 }
1285
1286 /**
1287  *      dev_close - shutdown an interface.
1288  *      @dev: device to shutdown
1289  *
1290  *      This function moves an active device into down state. A
1291  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293  *      chain.
1294  */
1295 int dev_close(struct net_device *dev)
1296 {
1297         if (dev->flags & IFF_UP) {
1298                 LIST_HEAD(single);
1299
1300                 list_add(&dev->unreg_list, &single);
1301                 dev_close_many(&single);
1302                 list_del(&single);
1303         }
1304         return 0;
1305 }
1306 EXPORT_SYMBOL(dev_close);
1307
1308
1309 /**
1310  *      dev_disable_lro - disable Large Receive Offload on a device
1311  *      @dev: device
1312  *
1313  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1314  *      called under RTNL.  This is needed if received packets may be
1315  *      forwarded to another interface.
1316  */
1317 void dev_disable_lro(struct net_device *dev)
1318 {
1319         u32 flags;
1320
1321         /*
1322          * If we're trying to disable lro on a vlan device
1323          * use the underlying physical device instead
1324          */
1325         if (is_vlan_dev(dev))
1326                 dev = vlan_dev_real_dev(dev);
1327
1328         if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329                 flags = dev->ethtool_ops->get_flags(dev);
1330         else
1331                 flags = ethtool_op_get_flags(dev);
1332
1333         if (!(flags & ETH_FLAG_LRO))
1334                 return;
1335
1336         __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337         if (unlikely(dev->features & NETIF_F_LRO))
1338                 netdev_WARN(dev, "failed to disable LRO!\n");
1339 }
1340 EXPORT_SYMBOL(dev_disable_lro);
1341
1342
/*
 * Starts out non-zero. While it is non-zero, register_netdevice_notifier()
 * links the notifier into the chain but does not replay REGISTER/UP events
 * for existing devices.
 * NOTE(review): presumably cleared once boot-time network initialisation
 * has finished; the clearing site is outside this part of the file.
 */
static int dev_boot_phase = 1;
1344
1345 /**
1346  *      register_netdevice_notifier - register a network notifier block
1347  *      @nb: notifier
1348  *
1349  *      Register a notifier to be called when network device events occur.
1350  *      The notifier passed is linked into the kernel structures and must
1351  *      not be reused until it has been unregistered. A negative errno code
1352  *      is returned on a failure.
1353  *
1354  *      When registered all registration and up events are replayed
1355  *      to the new notifier to allow device to have a race free
1356  *      view of the network device list.
1357  */
1358
1359 int register_netdevice_notifier(struct notifier_block *nb)
1360 {
1361         struct net_device *dev;
1362         struct net_device *last;
1363         struct net *net;
1364         int err;
1365
1366         rtnl_lock();
1367         err = raw_notifier_chain_register(&netdev_chain, nb);
1368         if (err)
1369                 goto unlock;
1370         if (dev_boot_phase)
1371                 goto unlock;
1372         for_each_net(net) {
1373                 for_each_netdev(net, dev) {
1374                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375                         err = notifier_to_errno(err);
1376                         if (err)
1377                                 goto rollback;
1378
1379                         if (!(dev->flags & IFF_UP))
1380                                 continue;
1381
1382                         nb->notifier_call(nb, NETDEV_UP, dev);
1383                 }
1384         }
1385
1386 unlock:
1387         rtnl_unlock();
1388         return err;
1389
1390 rollback:
1391         last = dev;
1392         for_each_net(net) {
1393                 for_each_netdev(net, dev) {
1394                         if (dev == last)
1395                                 break;
1396
1397                         if (dev->flags & IFF_UP) {
1398                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1400                         }
1401                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403                 }
1404         }
1405
1406         raw_notifier_chain_unregister(&netdev_chain, nb);
1407         goto unlock;
1408 }
1409 EXPORT_SYMBOL(register_netdevice_notifier);
1410
/**
 *      unregister_netdevice_notifier - unregister a network notifier block
 *      @nb: notifier
 *
 *      Unregister a notifier previously registered by
 *      register_netdevice_notifier(). The notifier is unlinked from the
 *      kernel structures and may then be reused. A negative errno code
 *      is returned on a failure.
 *
 *      Takes and releases the RTNL lock around the chain update.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        int err;

        rtnl_lock();
        err = raw_notifier_chain_unregister(&netdev_chain, nb);
        rtnl_unlock();
        return err;
}
1430 EXPORT_SYMBOL(unregister_netdevice_notifier);
1431
/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain(). Asserts that RTNL is held.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        ASSERT_RTNL();
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1446 EXPORT_SYMBOL(call_netdevice_notifiers);
1447
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450
/* Register one more consumer of rx skb time stamps (bumps netstamp_needed). */
void net_enable_timestamp(void)
{
        atomic_inc(&netstamp_needed);
}
1455 EXPORT_SYMBOL(net_enable_timestamp);
1456
/* Drop one consumer of rx skb time stamps (decrements netstamp_needed). */
void net_disable_timestamp(void)
{
        atomic_dec(&netstamp_needed);
}
1461 EXPORT_SYMBOL(net_disable_timestamp);
1462
1463 static inline void net_timestamp_set(struct sk_buff *skb)
1464 {
1465         if (atomic_read(&netstamp_needed))
1466                 __net_timestamp(skb);
1467         else
1468                 skb->tstamp.tv64 = 0;
1469 }
1470
1471 static inline void net_timestamp_check(struct sk_buff *skb)
1472 {
1473         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474                 __net_timestamp(skb);
1475 }
1476
1477 static inline bool is_skb_forwardable(struct net_device *dev,
1478                                       struct sk_buff *skb)
1479 {
1480         unsigned int len;
1481
1482         if (!(dev->flags & IFF_UP))
1483                 return false;
1484
1485         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486         if (skb->len <= len)
1487                 return true;
1488
1489         /* if TSO is enabled, we don't care about the length as the packet
1490          * could be forwarded without being segmented before
1491          */
1492         if (skb_is_gso(skb))
1493                 return true;
1494
1495         return false;
1496 }
1497
1498 /**
1499  * dev_forward_skb - loopback an skb to another netif
1500  *
1501  * @dev: destination network device
1502  * @skb: buffer to forward
1503  *
1504  * return values:
1505  *      NET_RX_SUCCESS  (no congestion)
1506  *      NET_RX_DROP     (packet was dropped, but freed)
1507  *
1508  * dev_forward_skb can be used for injecting an skb from the
1509  * start_xmit function of one device into the receive queue
1510  * of another device.
1511  *
1512  * The receiving device may be in another namespace, so
1513  * we have to clear all information in the skb that could
1514  * impact namespace isolation.
1515  */
1516 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517 {
1518         skb_orphan(skb);
1519         nf_reset(skb);
1520
1521         if (unlikely(!is_skb_forwardable(dev, skb))) {
1522                 atomic_long_inc(&dev->rx_dropped);
1523                 kfree_skb(skb);
1524                 return NET_RX_DROP;
1525         }
1526         skb_set_dev(skb, dev);
1527         skb->tstamp.tv64 = 0;
1528         skb->pkt_type = PACKET_HOST;
1529         skb->protocol = eth_type_trans(skb, dev);
1530         return netif_rx(skb);
1531 }
1532 EXPORT_SYMBOL_GPL(dev_forward_skb);
1533
/*
 * Hand @skb to the handler of @pt_prev and return the handler's result.
 * An extra reference is taken on the skb (skb->users) before the call.
 * The handler receives skb->dev as the device and @orig_dev as the
 * original device.
 */
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        atomic_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1541
/*
 *      Support routine. Sends outgoing frames to any network
 *      taps currently in use.
 *
 *      Walks ptype_all under rcu_read_lock(). A tap qualifies when it is
 *      bound to @dev or to no device, and its af_packet_priv is either
 *      NULL or differs from the socket that owns @skb. @skb is cloned
 *      once, when the first qualifying tap is found; the clone is handed
 *      to each qualifying tap through deliver_skb(), except for the last
 *      one, which gets it via a direct call to its handler. If the clone
 *      cannot be allocated, no tap receives the frame.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;
        struct sk_buff *skb2 = NULL;
        struct packet_type *pt_prev = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if ((ptype->dev == dev || !ptype->dev) &&
                    (ptype->af_packet_priv == NULL ||
                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
                        /*
                         * A previous tap is pending: deliver the existing
                         * clone to it and remember this tap instead.
                         */
                        if (pt_prev) {
                                deliver_skb(skb2, pt_prev, skb->dev);
                                pt_prev = ptype;
                                continue;
                        }

                        /* First qualifying tap: create the shared clone. */
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (!skb2)
                                break;

                        net_timestamp_set(skb2);

                        /* skb->nh should be correctly
                         * set by sender, so that the second statement is
                         * just protection against buggy protocols.
                         */
                        skb_reset_mac_header(skb2);

                        if (skb_network_header(skb2) < skb2->data ||
                            skb2->network_header > skb2->tail) {
                                if (net_ratelimit())
                                        printk(KERN_CRIT "protocol %04x is "
                                               "buggy, dev %s\n",
                                               ntohs(skb2->protocol),
                                               dev->name);
                                skb_reset_network_header(skb2);
                        }

                        skb2->transport_header = skb2->network_header;
                        skb2->pkt_type = PACKET_OUTGOING;
                        pt_prev = ptype;
                }
        }
        /* The last pending tap gets the clone without an extra reference. */
        if (pt_prev)
                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
        rcu_read_unlock();
}
1598
1599 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1600  * @dev: Network device
1601  * @txq: number of queues available
1602  *
1603  * If real_num_tx_queues is changed the tc mappings may no longer be
1604  * valid. To resolve this verify the tc mapping remains valid and if
1605  * not NULL the mapping. With no priorities mapping to this
1606  * offset/count pair it will no longer be used. In the worst case TC0
1607  * is invalid nothing can be done so disable priority mappings. If is
1608  * expected that drivers will fix this mapping if they can before
1609  * calling netif_set_real_num_tx_queues.
1610  */
1611 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1612 {
1613         int i;
1614         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1615
1616         /* If TC0 is invalidated disable TC mapping */
1617         if (tc->offset + tc->count > txq) {
1618                 pr_warning("Number of in use tx queues changed "
1619                            "invalidating tc mappings. Priority "
1620                            "traffic classification disabled!\n");
1621                 dev->num_tc = 0;
1622                 return;
1623         }
1624
1625         /* Invalidated prio to tc mappings set to TC0 */
1626         for (i = 1; i < TC_BITMASK + 1; i++) {
1627                 int q = netdev_get_prio_tc_map(dev, i);
1628
1629                 tc = &dev->tc_to_txq[q];
1630                 if (tc->offset + tc->count > txq) {
1631                         pr_warning("Number of in use tx queues "
1632                                    "changed. Priority %i to tc "
1633                                    "mapping %i is no longer valid "
1634                                    "setting map to 0\n",
1635                                    i, q);
1636                         netdev_set_prio_tc_map(dev, i, 0);
1637                 }
1638         }
1639 }
1640
1641 /*
1642  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1643  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1644  */
1645 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1646 {
1647         int rc;
1648
1649         if (txq < 1 || txq > dev->num_tx_queues)
1650                 return -EINVAL;
1651
1652         if (dev->reg_state == NETREG_REGISTERED ||
1653             dev->reg_state == NETREG_UNREGISTERING) {
1654                 ASSERT_RTNL();
1655
1656                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1657                                                   txq);
1658                 if (rc)
1659                         return rc;
1660
1661                 if (dev->num_tc)
1662                         netif_setup_tc(dev, txq);
1663
1664                 if (txq < dev->real_num_tx_queues)
1665                         qdisc_reset_all_tx_gt(dev, txq);
1666         }
1667
1668         dev->real_num_tx_queues = txq;
1669         return 0;
1670 }
1671 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1672
1673 #ifdef CONFIG_RPS
1674 /**
1675  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1676  *      @dev: Network device
1677  *      @rxq: Actual number of RX queues
1678  *
1679  *      This must be called either with the rtnl_lock held or before
1680  *      registration of the net device.  Returns 0 on success, or a
1681  *      negative error code.  If called before registration, it always
1682  *      succeeds.
1683  */
1684 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1685 {
1686         int rc;
1687
1688         if (rxq < 1 || rxq > dev->num_rx_queues)
1689                 return -EINVAL;
1690
1691         if (dev->reg_state == NETREG_REGISTERED) {
1692                 ASSERT_RTNL();
1693
1694                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1695                                                   rxq);
1696                 if (rc)
1697                         return rc;
1698         }
1699
1700         dev->real_num_rx_queues = rxq;
1701         return 0;
1702 }
1703 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1704 #endif
1705
/*
 * Append @q to the tail of this CPU's softnet output queue and raise
 * NET_TX_SOFTIRQ. The queue is manipulated with local interrupts
 * disabled.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = &__get_cpu_var(softnet_data);
        /* @q becomes the new tail: terminate it, link it, advance the tail. */
        q->next_sched = NULL;
        *sd->output_queue_tailp = q;
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
1719
1720 void __netif_schedule(struct Qdisc *q)
1721 {
1722         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1723                 __netif_reschedule(q);
1724 }
1725 EXPORT_SYMBOL(__netif_schedule);
1726
1727 void dev_kfree_skb_irq(struct sk_buff *skb)
1728 {
1729         if (atomic_dec_and_test(&skb->users)) {
1730                 struct softnet_data *sd;
1731                 unsigned long flags;
1732
1733                 local_irq_save(flags);
1734                 sd = &__get_cpu_var(softnet_data);
1735                 skb->next = sd->completion_queue;
1736                 sd->completion_queue = skb;
1737                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1738                 local_irq_restore(flags);
1739         }
1740 }
1741 EXPORT_SYMBOL(dev_kfree_skb_irq);
1742
/*
 * Release @skb from any context: uses dev_kfree_skb_irq() when in hard IRQ
 * context or with interrupts disabled, and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
1750 EXPORT_SYMBOL(dev_kfree_skb_any);
1751
1752
1753 /**
1754  * netif_device_detach - mark device as removed
1755  * @dev: network device
1756  *
1757  * Mark device as removed from system and therefore no longer available.
1758  */
1759 void netif_device_detach(struct net_device *dev)
1760 {
1761         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1762             netif_running(dev)) {
1763                 netif_tx_stop_all_queues(dev);
1764         }
1765 }
1766 EXPORT_SYMBOL(netif_device_detach);
1767
1768 /**
1769  * netif_device_attach - mark device as attached
1770  * @dev: network device
1771  *
1772  * Mark device as attached from system and restart if needed.
1773  */
1774 void netif_device_attach(struct net_device *dev)
1775 {
1776         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1777             netif_running(dev)) {
1778                 netif_tx_wake_all_queues(dev);
1779                 __netdev_watchdog_up(dev);
1780         }
1781 }
1782 EXPORT_SYMBOL(netif_device_attach);
1783
1784 /**
 * skb_set_dev -- assign a new device to a buffer
1786  * @skb: buffer for the new device
1787  * @dev: network device
1788  *
1789  * If an skb is owned by a device already, we have to reset
1790  * all data private to the namespace a device belongs to
1791  * before assigning it a new device.
1792  */
1793 #ifdef CONFIG_NET_NS
1794 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1795 {
1796         skb_dst_drop(skb);
1797         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1798                 secpath_reset(skb);
1799                 nf_reset(skb);
1800                 skb_init_secmark(skb);
1801                 skb->mark = 0;
1802                 skb->priority = 0;
1803                 skb->nf_trace = 0;
1804                 skb->ipvs_property = 0;
1805 #ifdef CONFIG_NET_SCHED
1806                 skb->tc_index = 0;
1807 #endif
1808         }
1809         skb->dev = dev;
1810 }
1811 EXPORT_SYMBOL(skb_set_dev);
1812 #endif /* CONFIG_NET_NS */
1813
1814 /*
1815  * Invalidate hardware checksum when packet is to be mangled, and
1816  * complete checksum manually on outgoing path.
1817  */
1818 int skb_checksum_help(struct sk_buff *skb)
1819 {
1820         __wsum csum;
1821         int ret = 0, offset;
1822
1823         if (skb->ip_summed == CHECKSUM_COMPLETE)
1824                 goto out_set_summed;
1825
1826         if (unlikely(skb_shinfo(skb)->gso_size)) {
1827                 /* Let GSO fix up the checksum. */
1828                 goto out_set_summed;
1829         }
1830
1831         offset = skb_checksum_start_offset(skb);
1832         BUG_ON(offset >= skb_headlen(skb));
1833         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1834
1835         offset += skb->csum_offset;
1836         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1837
1838         if (skb_cloned(skb) &&
1839             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1840                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1841                 if (ret)
1842                         goto out;
1843         }
1844
1845         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1846 out_set_summed:
1847         skb->ip_summed = CHECKSUM_NONE;
1848 out:
1849         return ret;
1850 }
1851 EXPORT_SYMBOL(skb_checksum_help);
1852
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 *
 *      On failure an ERR_PTR() value is returned: -EINVAL if a VLAN header
 *      cannot be pulled, the pskb_expand_head() error if the header cannot
 *      be unshared, -EPROTONOSUPPORT if no matching protocol handler is
 *      found, or whatever the handler's callbacks report.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int vlan_depth = ETH_HLEN;
        int err;

        /*
         * Step over any stacked 802.1Q headers to find the encapsulated
         * protocol, making sure each VLAN header can be pulled first.
         */
        while (type == htons(ETH_P_8021Q)) {
                struct vlan_hdr *vh;

                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
                        return ERR_PTR(-EINVAL);

                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
                type = vh->h_vlan_encapsulated_proto;
                vlan_depth += VLAN_HLEN;
        }

        /*
         * Record the MAC header at the current data pointer, derive
         * mac_len from the header offsets and pull the MAC header off.
         */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                struct net_device *dev = skb->dev;
                struct ethtool_drvinfo info = {};

                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
                        dev->ethtool_ops->get_drvinfo(dev, &info);

                /* Unconditional warning carrying driver and feature info. */
                WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
                     info.driver, dev ? dev->features : 0L,
                     skb->sk ? skb->sk->sk_route_caps : 0L,
                     skb->len, skb->data_len, skb->ip_summed);

                /* Get a private copy of the header if it is shared. */
                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        /*
         * Find the device-independent handler for this protocol that
         * provides a gso_segment callback; only the first match is used.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /*
                                 * For err == 0 this is a null pointer, which
                                 * is what gets returned when skb_gso_ok()
                                 * says no segmentation is needed.
                                 */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Rewind data to the network header. */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Put skb->data back at the MAC header before returning. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
1925 EXPORT_SYMBOL(skb_gso_segment);
1926
1927 /* Take action when hardware reception checksum errors are detected. */
1928 #ifdef CONFIG_BUG
1929 void netdev_rx_csum_fault(struct net_device *dev)
1930 {
1931         if (net_ratelimit()) {
1932                 printk(KERN_ERR "%s: hw csum failure.\n",
1933                         dev ? dev->name : "<unknown>");
1934                 dump_stack();
1935         }
1936 }
1937 EXPORT_SYMBOL(netdev_rx_csum_fault);
1938 #endif
1939
/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and can map all of memory.
 * 2. No high memory really exists on this machine.
 */
1944
/*
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from, 0
 * otherwise.  Without CONFIG_HIGHMEM this always returns 0.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int f;

        /* Device cannot DMA from highmem: any highmem fragment is illegal. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[f].page))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                struct device *parent = dev->dev.parent;

                if (!parent)
                        return 0;

                /* Every fragment page must lie entirely below the DMA mask. */
                for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
                        dma_addr_t phys = page_to_phys(skb_shinfo(skb)->frags[f].page);

                        if (!parent->dma_mask ||
                            phys + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
1969
/*
 * State kept in an skb's control block (skb->cb) while it carries a list
 * of GSO segments: the destructor the skb had before segmentation.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control block of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1975
1976 static void dev_gso_skb_destructor(struct sk_buff *skb)
1977 {
1978         struct dev_gso_cb *cb;
1979
1980         do {
1981                 struct sk_buff *nskb = skb->next;
1982
1983                 skb->next = nskb->next;
1984                 nskb->next = NULL;
1985                 kfree_skb(nskb);
1986         } while (skb->next);
1987
1988         cb = DEV_GSO_CB(skb);
1989         if (cb->destructor)
1990                 cb->destructor(skb);
1991 }
1992
1993 /**
1994  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1995  *      @skb: buffer to segment
1996  *      @features: device features as applicable to this skb
1997  *
1998  *      This function segments the given skb and stores the list of segments
1999  *      in skb->next.
2000  */
2001 static int dev_gso_segment(struct sk_buff *skb, int features)
2002 {
2003         struct sk_buff *segs;
2004
2005         segs = skb_gso_segment(skb, features);
2006
2007         /* Verifying header integrity only. */
2008         if (!segs)
2009                 return 0;
2010
2011         if (IS_ERR(segs))
2012                 return PTR_ERR(segs);
2013
2014         skb->next = segs;
2015         DEV_GSO_CB(skb)->destructor = skb->destructor;
2016         skb->destructor = dev_gso_skb_destructor;
2017
2018         return 0;
2019 }
2020
2021 /*
2022  * Try to orphan skb early, right before transmission by the device.
2023  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2024  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2025  */
2026 static inline void skb_orphan_try(struct sk_buff *skb)
2027 {
2028         struct sock *sk = skb->sk;
2029
2030         if (sk && !skb_shinfo(skb)->tx_flags) {
2031                 /* skb_tx_hash() wont be able to get sk.
2032                  * We copy sk_hash into skb->rxhash
2033                  */
2034                 if (!skb->rxhash)
2035                         skb->rxhash = sk->sk_hash;
2036                 skb_orphan(skb);
2037         }
2038 }
2039
2040 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2041 {
2042         return ((features & NETIF_F_GEN_CSUM) ||
2043                 ((features & NETIF_F_V4_CSUM) &&
2044                  protocol == htons(ETH_P_IP)) ||
2045                 ((features & NETIF_F_V6_CSUM) &&
2046                  protocol == htons(ETH_P_IPV6)) ||
2047                 ((features & NETIF_F_FCOE_CRC) &&
2048                  protocol == htons(ETH_P_FCOE)));
2049 }
2050
2051 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2052 {
2053         if (!can_checksum_protocol(features, protocol)) {
2054                 features &= ~NETIF_F_ALL_CSUM;
2055                 features &= ~NETIF_F_SG;
2056         } else if (illegal_highdma(skb->dev, skb)) {
2057                 features &= ~NETIF_F_SG;
2058         }
2059
2060         return features;
2061 }
2062
2063 u32 netif_skb_features(struct sk_buff *skb)
2064 {
2065         __be16 protocol = skb->protocol;
2066         u32 features = skb->dev->features;
2067
2068         if (protocol == htons(ETH_P_8021Q)) {
2069                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2070                 protocol = veh->h_vlan_encapsulated_proto;
2071         } else if (!vlan_tx_tag_present(skb)) {
2072                 return harmonize_features(skb, protocol, features);
2073         }
2074
2075         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2076
2077         if (protocol != htons(ETH_P_8021Q)) {
2078                 return harmonize_features(skb, protocol, features);
2079         } else {
2080                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2081                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2082                 return harmonize_features(skb, protocol, features);
2083         }
2084 }
2085 EXPORT_SYMBOL(netif_skb_features);
2086
2087 /*
2088  * Returns true if either:
2089  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2090  *      2. skb is fragmented and the device does not support SG, or if
2091  *         at least one of fragments is in highmem and device does not
2092  *         support DMA from it.
2093  */
2094 static inline int skb_needs_linearize(struct sk_buff *skb,
2095                                       int features)
2096 {
2097         return skb_is_nonlinear(skb) &&
2098                         ((skb_has_frag_list(skb) &&
2099                                 !(features & NETIF_F_FRAGLIST)) ||
2100                         (skb_shinfo(skb)->nr_frags &&
2101                                 !(features & NETIF_F_SG)));
2102 }
2103
/*
 * Hand @skb to the driver of @dev through ndo_start_xmit.
 *
 * An skb without a segment list (skb->next == NULL) is prepared first:
 * taps, early orphaning, software VLAN tag insertion, GSO segmentation or
 * linearization, and software checksum completion as the computed feature
 * set requires.  An skb that carries a segment list on skb->next, either
 * on entry or after segmentation here, has its segments sent one by one.
 *
 * Returns the driver's return code for the last transmit attempt,
 * NETDEV_TX_BUSY when @txq was stopped with segments still pending, or
 * the initial NETDEV_TX_OK when the skb was dropped during preparation.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc = NETDEV_TX_OK;
        unsigned int skb_len;

        if (likely(!skb->next)) {
                u32 features;

                /*
                 * If device doesn't need skb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(skb);

                /* Deliver a copy to taps if any are registered. */
                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                skb_orphan_try(skb);

                features = netif_skb_features(skb);

                /* Insert the VLAN tag in software if the device cannot. */
                if (vlan_tx_tag_present(skb) &&
                    !(features & NETIF_F_HW_VLAN_TX)) {
                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                        if (unlikely(!skb))
                                goto out;

                        skb->vlan_tci = 0;
                }

                if (netif_needs_gso(skb, features)) {
                        if (unlikely(dev_gso_segment(skb, features)))
                                goto out_kfree_skb;
                        /* Segments were produced: send them one by one. */
                        if (skb->next)
                                goto gso;
                } else {
                        if (skb_needs_linearize(skb, features) &&
                            __skb_linearize(skb))
                                goto out_kfree_skb;

                        /* If packet is not checksummed and device does not
                         * support checksumming for this protocol, complete
                         * checksumming here.
                         */
                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                                skb_set_transport_header(skb,
                                        skb_checksum_start_offset(skb));
                                if (!(features & NETIF_F_ALL_CSUM) &&
                                     skb_checksum_help(skb))
                                        goto out_kfree_skb;
                        }
                }

                /* Read the length first; the skb is passed to the driver next. */
                skb_len = skb->len;
                rc = ops->ndo_start_xmit(skb, dev);
                trace_net_dev_xmit(skb, rc, dev, skb_len);
                if (rc == NETDEV_TX_OK)
                        txq_trans_update(txq);
                return rc;
        }

gso:
        do {
                /* Detach the first pending segment from the list. */
                struct sk_buff *nskb = skb->next;

                skb->next = nskb->next;
                nskb->next = NULL;

                /*
                 * If device doesn't need nskb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(nskb);

                skb_len = nskb->len;
                rc = ops->ndo_start_xmit(nskb, dev);
                trace_net_dev_xmit(nskb, rc, dev, skb_len);
                if (unlikely(rc != NETDEV_TX_OK)) {
                        /* Codes outside NETDEV_TX_MASK: give up on the skb. */
                        if (rc & ~NETDEV_TX_MASK)
                                goto out_kfree_gso_skb;
                        /* Otherwise put the segment back at the list head. */
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                txq_trans_update(txq);
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

out_kfree_gso_skb:
        /*
         * With no segments left, restore the saved destructor.  Otherwise
         * the current one stays in place, presumably so that kfree_skb()
         * below releases the leftover segments as well.
         */
        if (likely(skb->next == NULL))
                skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
        kfree_skb(skb);
out:
        return rc;
}
2205
/*
 * Seed passed to jhash_1word() by the Tx queue hashing code in this file.
 * Its initialisation is not visible in this part of the file.
 */
static u32 hashrnd __read_mostly;
2207
2208 /*
2209  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2210  * to be used as a distribution range.
2211  */
2212 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2213                   unsigned int num_tx_queues)
2214 {
2215         u32 hash;
2216         u16 qoffset = 0;
2217         u16 qcount = num_tx_queues;
2218
2219         if (skb_rx_queue_recorded(skb)) {
2220                 hash = skb_get_rx_queue(skb);
2221                 while (unlikely(hash >= num_tx_queues))
2222                         hash -= num_tx_queues;
2223                 return hash;
2224         }
2225
2226         if (dev->num_tc) {
2227                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2228                 qoffset = dev->tc_to_txq[tc].offset;
2229                 qcount = dev->tc_to_txq[tc].count;
2230         }
2231
2232         if (skb->sk && skb->sk->sk_hash)
2233                 hash = skb->sk->sk_hash;
2234         else
2235                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2236         hash = jhash_1word(hash, hashrnd);
2237
2238         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2239 }
2240 EXPORT_SYMBOL(__skb_tx_hash);
2241
2242 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2243 {
2244         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2245                 if (net_ratelimit()) {
2246                         pr_warning("%s selects TX queue %d, but "
2247                                 "real number of TX queues is %d\n",
2248                                 dev->name, queue_index, dev->real_num_tx_queues);
2249                 }
2250                 return 0;
2251         }
2252         return queue_index;
2253 }
2254
/*
 * Look up a TX queue for @skb in the device's XPS map entry for the current
 * CPU.  Returns the queue index, or -1 when there is no usable entry, when
 * the selected queue is not below dev->real_num_tx_queues, or when
 * CONFIG_XPS is disabled.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int index = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                index = cpu_map->queues[0];
        } else {
                /* Several candidates: spread flows over them by hash. */
                u32 hash;

                if (skb->sk && skb->sk->sk_hash)
                        hash = skb->sk->sk_hash;
                else
                        hash = (__force u16) skb->protocol ^
                            skb->rxhash;
                hash = jhash_1word(hash, hashrnd);
                index = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
        }

        if (unlikely(index >= dev->real_num_tx_queues))
                index = -1;

unlock:
        rcu_read_unlock();

        return index;
#else
        return -1;
#endif
}
2292
2293 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2294                                         struct sk_buff *skb)
2295 {
2296         int queue_index;
2297         const struct net_device_ops *ops = dev->netdev_ops;
2298
2299         if (dev->real_num_tx_queues == 1)
2300                 queue_index = 0;
2301         else if (ops->ndo_select_queue) {
2302                 queue_index = ops->ndo_select_queue(dev, skb);
2303                 queue_index = dev_cap_txqueue(dev, queue_index);
2304         } else {
2305                 struct sock *sk = skb->sk;
2306                 queue_index = sk_tx_queue_get(sk);
2307
2308                 if (queue_index < 0 || skb->ooo_okay ||
2309                     queue_index >= dev->real_num_tx_queues) {
2310                         int old_index = queue_index;
2311
2312                         queue_index = get_xps_queue(dev, skb);
2313                         if (queue_index < 0)
2314                                 queue_index = skb_tx_hash(dev, skb);
2315
2316                         if (queue_index != old_index && sk) {
2317                                 struct dst_entry *dst =
2318                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2319
2320                                 if (dst && skb_dst(skb) == dst)
2321                                         sk_tx_queue_set(sk, queue_index);
2322                         }
2323                 }
2324         }
2325
2326         skb_set_queue_mapping(skb, queue_index);
2327         return netdev_get_tx_queue(dev, queue_index);
2328 }
2329
/*
 * Submit @skb to qdisc @q of @txq on @dev, under the qdisc's root lock.
 *
 * Returns NET_XMIT_DROP (after freeing the skb) if the qdisc is
 * deactivated, NET_XMIT_SUCCESS if the skb was sent directly through the
 * bypass path, and otherwise the enqueue result masked with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        bool contended;
        int rc;

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        qdisc_calculate_pkt_len(skb, q);
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
         * and dequeue packets faster.
         */
        contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */
                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                        skb_dst_force(skb);

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
                        /* Release busylock before running the qdisc. */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                } else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        } else {
                /* Normal path: enqueue, then run the qdisc if it is idle. */
                skb_dst_force(skb);
                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }
        }
        spin_unlock(root_lock);
        /* busylock is still held if no branch above released it. */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
2392
/*
 * Per-CPU counter that dev_queue_xmit() increments around its call to
 * dev_hard_start_xmit() on the queueless path.  Once the counter exceeds
 * RECURSION_LIMIT further transmits on that path are refused.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2395
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue operation takes the packet from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* This CPU owning the xmit lock already means recursion. */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_tx_queue_stopped(txq)) {
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /*
         * Failure on the queueless path (device down, queue stopped,
         * transmit not completed, or recursion): drop the skb.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
2500 EXPORT_SYMBOL(dev_queue_xmit);
2501
2502
2503 /*=======================================================================
2504                         Receiver routines
2505   =======================================================================*/
2506
2507 int netdev_max_backlog __read_mostly = 1000;
2508 int netdev_tstamp_prequeue __read_mostly = 1;
2509 int netdev_budget __read_mostly = 300;
2510 int weight_p __read_mostly = 64;            /* old backlog weight */
2511
2512 /* Called with irq disabled */
2513 static inline void ____napi_schedule(struct softnet_data *sd,
2514                                      struct napi_struct *napi)
2515 {
2516         list_add_tail(&napi->poll_list, &sd->poll_list);
2517         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2518 }
2519
2520 /*
2521  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2522  * and src/dst port numbers. Returns a non-zero hash number on success
2523  * and 0 on failure.
2524  */
2525 __u32 __skb_get_rxhash(struct sk_buff *skb)
2526 {
2527         int nhoff, hash = 0, poff;
2528         const struct ipv6hdr *ip6;
2529         const struct iphdr *ip;
2530         u8 ip_proto;
2531         u32 addr1, addr2, ihl;
2532         union {
2533                 u32 v32;
2534                 u16 v16[2];
2535         } ports;
2536
2537         nhoff = skb_network_offset(skb);
2538
2539         switch (skb->protocol) {
2540         case __constant_htons(ETH_P_IP):
2541                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2542                         goto done;
2543
2544                 ip = (const struct iphdr *) (skb->data + nhoff);
2545                 if (ip_is_fragment(ip))
2546                         ip_proto = 0;
2547                 else
2548                         ip_proto = ip->protocol;
2549                 addr1 = (__force u32) ip->saddr;
2550                 addr2 = (__force u32) ip->daddr;
2551                 ihl = ip->ihl;
2552                 break;
2553         case __constant_htons(ETH_P_IPV6):
2554                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2555                         goto done;
2556
2557                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2558                 ip_proto = ip6->nexthdr;
2559                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2560                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2561                 ihl = (40 >> 2);
2562                 break;
2563         default:
2564                 goto done;
2565         }
2566
2567         ports.v32 = 0;
2568         poff = proto_ports_offset(ip_proto);
2569         if (poff >= 0) {
2570                 nhoff += ihl * 4 + poff;
2571                 if (pskb_may_pull(skb, nhoff + 4)) {
2572                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2573                         if (ports.v16[1] < ports.v16[0])
2574                                 swap(ports.v16[0], ports.v16[1]);
2575                 }
2576         }
2577
2578         /* get a consistent hash (same value on both flow directions) */
2579         if (addr2 < addr1)
2580                 swap(addr1, addr2);
2581
2582         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2583         if (!hash)
2584                 hash = 1;
2585
2586 done:
2587         return hash;
2588 }
2589 EXPORT_SYMBOL(__skb_get_rxhash);
2590
2591 #ifdef CONFIG_RPS
2592
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU and indexes ents[] with
 * (skb->rxhash & mask) to find the CPU desired for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2596
2597 static struct rps_dev_flow *
2598 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2599             struct rps_dev_flow *rflow, u16 next_cpu)
2600 {
2601         u16 tcpu;
2602
2603         tcpu = rflow->cpu = next_cpu;
2604         if (tcpu != RPS_NO_CPU) {
2605 #ifdef CONFIG_RFS_ACCEL
2606                 struct netdev_rx_queue *rxqueue;
2607                 struct rps_dev_flow_table *flow_table;
2608                 struct rps_dev_flow *old_rflow;
2609                 u32 flow_id;
2610                 u16 rxq_index;
2611                 int rc;
2612
2613                 /* Should we steer this flow to a different hardware queue? */
2614                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2615                     !(dev->features & NETIF_F_NTUPLE))
2616                         goto out;
2617                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2618                 if (rxq_index == skb_get_rx_queue(skb))
2619                         goto out;
2620
2621                 rxqueue = dev->_rx + rxq_index;
2622                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2623                 if (!flow_table)
2624                         goto out;
2625                 flow_id = skb->rxhash & flow_table->mask;
2626                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2627                                                         rxq_index, flow_id);
2628                 if (rc < 0)
2629                         goto out;
2630                 old_rflow = rflow;
2631                 rflow = &flow_table->flows[flow_id];
2632                 rflow->cpu = next_cpu;
2633                 rflow->filter = rc;
2634                 if (old_rflow->filter == rflow->filter)
2635                         old_rflow->filter = RPS_NO_FILTER;
2636         out:
2637 #endif
2638                 rflow->last_qtail =
2639                         per_cpu(softnet_data, tcpu).input_queue_head;
2640         }
2641
2642         return rflow;
2643 }
2644
2645 /*
2646  * get_rps_cpu is called from netif_receive_skb and returns the target
2647  * CPU from the RPS map of the receiving queue for a given skb.
2648  * rcu_read_lock must be held on entry.
2649  */
2650 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2651                        struct rps_dev_flow **rflowp)
2652 {
2653         struct netdev_rx_queue *rxqueue;
2654         struct rps_map *map;
2655         struct rps_dev_flow_table *flow_table;
2656         struct rps_sock_flow_table *sock_flow_table;
2657         int cpu = -1;
2658         u16 tcpu;
2659
2660         if (skb_rx_queue_recorded(skb)) {
2661                 u16 index = skb_get_rx_queue(skb);
2662                 if (unlikely(index >= dev->real_num_rx_queues)) {
2663                         WARN_ONCE(dev->real_num_rx_queues > 1,
2664                                   "%s received packet on queue %u, but number "
2665                                   "of RX queues is %u\n",
2666                                   dev->name, index, dev->real_num_rx_queues);
2667                         goto done;
2668                 }
2669                 rxqueue = dev->_rx + index;
2670         } else
2671                 rxqueue = dev->_rx;
2672
2673         map = rcu_dereference(rxqueue->rps_map);
2674         if (map) {
2675                 if (map->len == 1 &&
2676                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2677                         tcpu = map->cpus[0];
2678                         if (cpu_online(tcpu))
2679                                 cpu = tcpu;
2680                         goto done;
2681                 }
2682         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2683                 goto done;
2684         }
2685
2686         skb_reset_network_header(skb);
2687         if (!skb_get_rxhash(skb))
2688                 goto done;
2689
2690         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2691         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2692         if (flow_table && sock_flow_table) {
2693                 u16 next_cpu;
2694                 struct rps_dev_flow *rflow;
2695
2696                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2697                 tcpu = rflow->cpu;
2698
2699                 next_cpu = sock_flow_table->ents[skb->rxhash &
2700                     sock_flow_table->mask];
2701
2702                 /*
2703                  * If the desired CPU (where last recvmsg was done) is
2704                  * different from current CPU (one in the rx-queue flow
2705                  * table entry), switch if one of the following holds:
2706                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2707                  *   - Current CPU is offline.
2708                  *   - The current CPU's queue tail has advanced beyond the
2709                  *     last packet that was enqueued using this table entry.
2710                  *     This guarantees that all previous packets for the flow
2711                  *     have been dequeued, thus preserving in order delivery.
2712                  */
2713                 if (unlikely(tcpu != next_cpu) &&
2714                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2715                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2716                       rflow->last_qtail)) >= 0))
2717                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2718
2719                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2720                         *rflowp = rflow;
2721                         cpu = tcpu;
2722                         goto done;
2723                 }
2724         }
2725
2726         if (map) {
2727                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2728
2729                 if (cpu_online(tcpu)) {
2730                         cpu = tcpu;
2731                         goto done;
2732                 }
2733         }
2734
2735 done:
2736         return cpu;
2737 }
2738
2739 #ifdef CONFIG_RFS_ACCEL
2740
2741 /**
2742  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2743  * @dev: Device on which the filter was set
2744  * @rxq_index: RX queue index
2745  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2746  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2747  *
2748  * Drivers that implement ndo_rx_flow_steer() should periodically call
2749  * this function for each installed filter and remove the filters for
2750  * which it returns %true.
2751  */
2752 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2753                          u32 flow_id, u16 filter_id)
2754 {
2755         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2756         struct rps_dev_flow_table *flow_table;
2757         struct rps_dev_flow *rflow;
2758         bool expire = true;
2759         int cpu;
2760
2761         rcu_read_lock();
2762         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2763         if (flow_table && flow_id <= flow_table->mask) {
2764                 rflow = &flow_table->flows[flow_id];
2765                 cpu = ACCESS_ONCE(rflow->cpu);
2766                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2767                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2768                            rflow->last_qtail) <
2769                      (int)(10 * flow_table->mask)))
2770                         expire = false;
2771         }
2772         rcu_read_unlock();
2773         return expire;
2774 }
2775 EXPORT_SYMBOL(rps_may_expire_flow);
2776
2777 #endif /* CONFIG_RFS_ACCEL */
2778
2779 /* Called from hardirq (IPI) context */
2780 static void rps_trigger_softirq(void *data)
2781 {
2782         struct softnet_data *sd = data;
2783
2784         ____napi_schedule(sd, &sd->backlog);
2785         sd->received_rps++;
2786 }
2787
2788 #endif /* CONFIG_RPS */
2789
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it onto this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is our own (or RPS is not configured), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = &__get_cpu_var(softnet_data);

        if (sd == local)
                return 0;

        /* push sd on the head of the local pending-IPI list */
        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2810
2811 /*
2812  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2813  * queue (may be a remote CPU queue).
2814  */
2815 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2816                               unsigned int *qtail)
2817 {
2818         struct softnet_data *sd;
2819         unsigned long flags;
2820
2821         sd = &per_cpu(softnet_data, cpu);
2822
2823         local_irq_save(flags);
2824
2825         rps_lock(sd);
2826         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2827                 if (skb_queue_len(&sd->input_pkt_queue)) {
2828 enqueue:
2829                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2830                         input_queue_tail_incr_save(sd, qtail);
2831                         rps_unlock(sd);
2832                         local_irq_restore(flags);
2833                         return NET_RX_SUCCESS;
2834                 }
2835
2836                 /* Schedule NAPI for backlog device
2837                  * We can use non atomic operation since we own the queue lock
2838                  */
2839                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2840                         if (!rps_ipi_queued(sd))
2841                                 ____napi_schedule(sd, &sd->backlog);
2842                 }
2843                 goto enqueue;
2844         }
2845
2846         sd->dropped++;
2847         rps_unlock(sd);
2848
2849         local_irq_restore(flags);
2850
2851         atomic_long_inc(&skb->dev->rx_dropped);
2852         kfree_skb(skb);
2853         return NET_RX_DROP;
2854 }
2855
2856 /**
2857  *      netif_rx        -       post buffer to the network code
2858  *      @skb: buffer to post
2859  *
2860  *      This function receives a packet from a device driver and queues it for
2861  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2862  *      may be dropped during processing for congestion control or by the
2863  *      protocol layers.
2864  *
2865  *      return values:
2866  *      NET_RX_SUCCESS  (no congestion)
2867  *      NET_RX_DROP     (packet was dropped)
2868  *
2869  */
2870
2871 int netif_rx(struct sk_buff *skb)
2872 {
2873         int ret;
2874
2875         /* if netpoll wants it, pretend we never saw it */
2876         if (netpoll_rx(skb))
2877                 return NET_RX_DROP;
2878
2879         if (netdev_tstamp_prequeue)
2880                 net_timestamp_check(skb);
2881
2882         trace_netif_rx(skb);
2883 #ifdef CONFIG_RPS
2884         {
2885                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2886                 int cpu;
2887
2888                 preempt_disable();
2889                 rcu_read_lock();
2890
2891                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2892                 if (cpu < 0)
2893                         cpu = smp_processor_id();
2894
2895                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2896
2897                 rcu_read_unlock();
2898                 preempt_enable();
2899         }
2900 #else
2901         {
2902                 unsigned int qtail;
2903                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2904                 put_cpu();
2905         }
2906 #endif
2907         return ret;
2908 }
2909 EXPORT_SYMBOL(netif_rx);
2910
/*
 * netif_rx_ni - netif_rx() followed by running any pending softirqs,
 * all with preemption disabled.  Returns the netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2924
2925 static void net_tx_action(struct softirq_action *h)
2926 {
2927         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2928
2929         if (sd->completion_queue) {
2930                 struct sk_buff *clist;
2931
2932                 local_irq_disable();
2933                 clist = sd->completion_queue;
2934                 sd->completion_queue = NULL;
2935                 local_irq_enable();
2936
2937                 while (clist) {
2938                         struct sk_buff *skb = clist;
2939                         clist = clist->next;
2940
2941                         WARN_ON(atomic_read(&skb->users));
2942                         trace_kfree_skb(skb, net_tx_action);
2943                         __kfree_skb(skb);
2944                 }
2945         }
2946
2947         if (sd->output_queue) {
2948                 struct Qdisc *head;
2949
2950                 local_irq_disable();
2951                 head = sd->output_queue;
2952                 sd->output_queue = NULL;
2953                 sd->output_queue_tailp = &sd->output_queue;
2954                 local_irq_enable();
2955
2956                 while (head) {
2957                         struct Qdisc *q = head;
2958                         spinlock_t *root_lock;
2959
2960                         head = head->next_sched;
2961
2962                         root_lock = qdisc_lock(q);
2963                         if (spin_trylock(root_lock)) {
2964                                 smp_mb__before_clear_bit();
2965                                 clear_bit(__QDISC_STATE_SCHED,
2966                                           &q->state);
2967                                 qdisc_run(q);
2968                                 spin_unlock(root_lock);
2969                         } else {
2970                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2971                                               &q->state)) {
2972                                         __netif_reschedule(q);
2973                                 } else {
2974                                         smp_mb__before_clear_bit();
2975                                         clear_bit(__QDISC_STATE_SCHED,
2976                                                   &q->state);
2977                                 }
2978                         }
2979                 }
2980         }
2981 }
2982
2983 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2984     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * It is only built when both the bridge and ATM LANE are configured
 * (built-in or as modules).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2989 #endif
2990
2991 #ifdef CONFIG_NET_CLS_ACT
2992 /* TODO: Maybe we should just force sch_ingress to be compiled in
2993  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2994  * a compare and 2 stores extra right now if we dont have it on
2995  * but have CONFIG_NET_CLS_ACT
2996  * NOTE: This doesn't stop any functionality; if you dont have
2997  * the ingress scheduler, you just can't add policies on ingress.
2998  *
2999  */
3000 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3001 {
3002         struct net_device *dev = skb->dev;
3003         u32 ttl = G_TC_RTTL(skb->tc_verd);
3004         int result = TC_ACT_OK;
3005         struct Qdisc *q;
3006
3007         if (unlikely(MAX_RED_LOOP < ttl++)) {
3008                 if (net_ratelimit())
3009                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3010                                skb->skb_iif, dev->ifindex);
3011                 return TC_ACT_SHOT;
3012         }
3013
3014         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3015         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3016
3017         q = rxq->qdisc;
3018         if (q != &noop_qdisc) {
3019                 spin_lock(qdisc_lock(q));
3020                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3021                         result = qdisc_enqueue_root(skb, q);
3022                 spin_unlock(qdisc_lock(q));
3023         }
3024
3025         return result;
3026 }
3027
3028 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3029                                          struct packet_type **pt_prev,
3030                                          int *ret, struct net_device *orig_dev)
3031 {
3032         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3033
3034         if (!rxq || rxq->qdisc == &noop_qdisc)
3035                 goto out;
3036
3037         if (*pt_prev) {
3038                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3039                 *pt_prev = NULL;
3040         }
3041
3042         switch (ing_filter(skb, rxq)) {
3043         case TC_ACT_SHOT:
3044         case TC_ACT_STOLEN:
3045                 kfree_skb(skb);
3046                 return NULL;
3047         }
3048
3049 out:
3050         skb->tc_verd = 0;
3051         return skb;
3052 }
3053 #endif
3054
3055 /**
3056  *      netdev_rx_handler_register - register receive handler
3057  *      @dev: device to register a handler for
3058  *      @rx_handler: receive handler to register
3059  *      @rx_handler_data: data pointer that is used by rx handler
3060  *
3061  *      Register a receive hander for a device. This handler will then be
3062  *      called from __netif_receive_skb. A negative errno code is returned
3063  *      on a failure.
3064  *
3065  *      The caller must hold the rtnl_mutex.
3066  *
3067  *      For a general description of rx_handler, see enum rx_handler_result.
3068  */
3069 int netdev_rx_handler_register(struct net_device *dev,
3070                                rx_handler_func_t *rx_handler,
3071                                void *rx_handler_data)
3072 {
3073         ASSERT_RTNL();
3074
3075         if (dev->rx_handler)
3076                 return -EBUSY;
3077
3078         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3079         rcu_assign_pointer(dev->rx_handler, rx_handler);
3080
3081         return 0;
3082 }
3083 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3084
3085 /**
3086  *      netdev_rx_handler_unregister - unregister receive handler
3087  *      @dev: device to unregister a handler from
3088  *
3089  *      Unregister a receive hander from a device.
3090  *
3091  *      The caller must hold the rtnl_mutex.
3092  */
3093 void netdev_rx_handler_unregister(struct net_device *dev)
3094 {
3095
3096         ASSERT_RTNL();
3097         rcu_assign_pointer(dev->rx_handler, NULL);
3098         rcu_assign_pointer(dev->rx_handler_data, NULL);
3099 }
3100 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3101
/*
 * __netif_receive_skb - run one received skb through the receive stack.
 *
 * In order: optional timestamping, netpoll check, 802.1Q untagging,
 * handlers on ptype_all, ingress filtering (CONFIG_NET_CLS_ACT), the
 * device's rx_handler, VLAN reception, and finally the handlers hashed
 * in ptype_base for skb->protocol.
 *
 * Delivery lags one handler behind the list walks: a matching handler
 * is remembered in pt_prev and only handed the skb (via deliver_skb())
 * once the next match is found or a stage needs the skb settled.  The
 * last remembered handler is called directly through ->func().
 *
 * Returns the value produced by the last handler that ran, or
 * NET_RX_DROP when netpoll took the packet or no handler matched (the
 * skb is then freed and the device's rx_dropped counter is bumped).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        /* timestamp here only if it was not already done before queueing */
        if (!netdev_tstamp_prequeue)
                net_timestamp_check(skb);

        trace_netif_receive_skb(skb);

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        /* remember the first interface the packet was seen on */
        if (!skb->skb_iif)
                skb->skb_iif = skb->dev->ifindex;
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        rcu_read_lock();

        /* re-entered when an rx_handler returns RX_HANDLER_ANOTHER */
another_round:

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
                skb = vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip both ptype_all and ingress filtering */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* handlers on ptype_all: unbound ones, or bound to the current device */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* NULL means the ingress filter consumed (freed) the skb */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                /* flush the pending handler before rx_handler may change the skb */
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                /* non-zero: process the (possibly replaced) skb again from the top */
                if (vlan_do_receive(&skb)) {
                        ret = __netif_receive_skb(skb);
                        goto out;
                } else if (unlikely(!skb))
                        goto out;
        }

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        /*
         * Protocol handlers: the type must match, and the handler must be
         * bound to null_or_dev, the current device or the original device.
         */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* last handler is invoked directly rather than via deliver_skb() */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
3227
3228 /**
3229  *      netif_receive_skb - process receive buffer from network
3230  *      @skb: buffer to process
3231  *
3232  *      netif_receive_skb() is the main receive data processing function.
3233  *      It always succeeds. The buffer may be dropped during processing
3234  *      for congestion control or by the protocol layers.
3235  *
3236  *      This function may only be called from softirq context and interrupts
3237  *      should be enabled.
3238  *
3239  *      Return values (usually ignored):
3240  *      NET_RX_SUCCESS: no congestion
3241  *      NET_RX_DROP: packet was dropped
3242  */
3243 int netif_receive_skb(struct sk_buff *skb)
3244 {
3245         if (netdev_tstamp_prequeue)
3246                 net_timestamp_check(skb);
3247
3248         if (skb_defer_rx_timestamp(skb))
3249                 return NET_RX_SUCCESS;
3250
3251 #ifdef CONFIG_RPS
3252         {
3253                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3254                 int cpu, ret;
3255
3256                 rcu_read_lock();
3257
3258                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3259
3260                 if (cpu >= 0) {
3261                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3262                         rcu_read_unlock();
3263                 } else {
3264                         rcu_read_unlock();
3265                         ret = __netif_receive_skb(skb);
3266                 }
3267
3268                 return ret;
3269         }
3270 #else
3271         return __netif_receive_skb(skb);
3272 #endif
3273 }
3274 EXPORT_SYMBOL(netif_receive_skb);
3275
3276 /* Network device is going away, flush any packets still pending
3277  * Called with irqs disabled.
3278  */
3279 static void flush_backlog(void *arg)
3280 {
3281         struct net_device *dev = arg;
3282         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3283         struct sk_buff *skb, *tmp;
3284
3285         rps_lock(sd);
3286         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3287                 if (skb->dev == dev) {
3288                         __skb_unlink(skb, &sd->input_pkt_queue);
3289                         kfree_skb(skb);
3290                         input_queue_head_incr(sd);
3291                 }
3292         }
3293         rps_unlock(sd);
3294
3295         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3296                 if (skb->dev == dev) {
3297                         __skb_unlink(skb, &sd->process_queue);
3298                         kfree_skb(skb);
3299                         input_queue_head_incr(sd);
3300                 }
3301         }
3302 }
3303
3304 static int napi_gro_complete(struct sk_buff *skb)
3305 {
3306         struct packet_type *ptype;
3307         __be16 type = skb->protocol;
3308         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3309         int err = -ENOENT;
3310
3311         if (NAPI_GRO_CB(skb)->count == 1) {
3312                 skb_shinfo(skb)->gso_size = 0;
3313                 goto out;
3314         }
3315
3316         rcu_read_lock();
3317         list_for_each_entry_rcu(ptype, head, list) {
3318                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3319                         continue;
3320
3321                 err = ptype->gro_complete(skb);
3322                 break;
3323         }
3324         rcu_read_unlock();
3325
3326         if (err) {
3327                 WARN_ON(&ptype->list == head);
3328                 kfree_skb(skb);
3329                 return NET_RX_SUCCESS;
3330         }
3331
3332 out:
3333         return netif_receive_skb(skb);
3334 }
3335
3336 inline void napi_gro_flush(struct napi_struct *napi)
3337 {
3338         struct sk_buff *skb, *next;
3339
3340         for (skb = napi->gro_list; skb; skb = next) {
3341                 next = skb->next;
3342                 skb->next = NULL;
3343                 napi_gro_complete(skb);
3344         }
3345
3346         napi->gro_count = 0;
3347         napi->gro_list = NULL;
3348 }
3349 EXPORT_SYMBOL(napi_gro_flush);
3350
/*
 * dev_gro_receive - offer @skb to the GRO engine of @napi.
 *
 * Returns GRO_NORMAL when the packet is not handled by GRO (GRO disabled
 * on the device, netpoll active, packet already GSO or carrying a frag
 * list, no protocol gro_receive handler, flush requested, or the held
 * list is full); GRO_MERGED / GRO_MERGED_FREE when the protocol handler
 * marked it same_flow; GRO_HELD when it was put on napi->gro_list as the
 * start of a new flow.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        /* use the first unbound handler of this type that implements gro_receive */
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* the walk reached the list head again: no handler was found */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* the handler pointed at a held skb: unlink it and complete it now */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* hold skb at the front of the GRO list as a new flow */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * The linear area is shorter than the GRO offset: copy the missing
         * bytes from frag0 to the tail and trim them off the first fragment.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* first fragment is now empty: release its page and shift the rest down */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        /* not handled by GRO; the header pull above is still performed */
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3442
3443 static inline gro_result_t
3444 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3445 {
3446         struct sk_buff *p;
3447
3448         for (p = napi->gro_list; p; p = p->next) {
3449                 unsigned long diffs;
3450
3451                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3452                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3453                 diffs |= compare_ether_header(skb_mac_header(p),
3454                                               skb_gro_mac_header(skb));
3455                 NAPI_GRO_CB(p)->same_flow = !diffs;
3456                 NAPI_GRO_CB(p)->flush = 0;
3457         }
3458
3459         return dev_gro_receive(napi, skb);
3460 }
3461
3462 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3463 {
3464         switch (ret) {
3465         case GRO_NORMAL:
3466                 if (netif_receive_skb(skb))
3467                         ret = GRO_DROP;
3468                 break;
3469
3470         case GRO_DROP:
3471         case GRO_MERGED_FREE:
3472                 kfree_skb(skb);
3473                 break;
3474
3475         case GRO_HELD:
3476         case GRO_MERGED:
3477                 break;
3478         }
3479
3480         return ret;
3481 }
3482 EXPORT_SYMBOL(napi_skb_finish);
3483
3484 void skb_gro_reset_offset(struct sk_buff *skb)
3485 {
3486         NAPI_GRO_CB(skb)->data_offset = 0;
3487         NAPI_GRO_CB(skb)->frag0 = NULL;
3488         NAPI_GRO_CB(skb)->frag0_len = 0;
3489
3490         if (skb->mac_header == skb->tail &&
3491             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3492                 NAPI_GRO_CB(skb)->frag0 =
3493                         page_address(skb_shinfo(skb)->frags[0].page) +
3494                         skb_shinfo(skb)->frags[0].page_offset;
3495                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3496         }
3497 }
3498 EXPORT_SYMBOL(skb_gro_reset_offset);
3499
3500 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3501 {
3502         skb_gro_reset_offset(skb);
3503
3504         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3505 }
3506 EXPORT_SYMBOL(napi_gro_receive);
3507
3508 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3509 {
3510         __skb_pull(skb, skb_headlen(skb));
3511         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3512         skb->vlan_tci = 0;
3513         skb->dev = napi->dev;
3514         skb->skb_iif = 0;
3515
3516         napi->skb = skb;
3517 }
3518
3519 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3520 {
3521         struct sk_buff *skb = napi->skb;
3522
3523         if (!skb) {
3524                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3525                 if (skb)
3526                         napi->skb = skb;
3527         }
3528         return skb;
3529 }
3530 EXPORT_SYMBOL(napi_get_frags);
3531
3532 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3533                                gro_result_t ret)
3534 {
3535         switch (ret) {
3536         case GRO_NORMAL:
3537         case GRO_HELD:
3538                 skb->protocol = eth_type_trans(skb, skb->dev);
3539
3540                 if (ret == GRO_HELD)
3541                         skb_gro_pull(skb, -ETH_HLEN);
3542                 else if (netif_receive_skb(skb))
3543                         ret = GRO_DROP;
3544                 break;
3545
3546         case GRO_DROP:
3547         case GRO_MERGED_FREE:
3548                 napi_reuse_skb(napi, skb);
3549                 break;
3550
3551         case GRO_MERGED:
3552                 break;
3553         }
3554
3555         return ret;
3556 }
3557 EXPORT_SYMBOL(napi_frags_finish);
3558
3559 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3560 {
3561         struct sk_buff *skb = napi->skb;
3562         struct ethhdr *eth;
3563         unsigned int hlen;
3564         unsigned int off;
3565
3566         napi->skb = NULL;
3567
3568         skb_reset_mac_header(skb);
3569         skb_gro_reset_offset(skb);
3570
3571         off = skb_gro_offset(skb);
3572         hlen = off + sizeof(*eth);
3573         eth = skb_gro_header_fast(skb, off);
3574         if (skb_gro_header_hard(skb, hlen)) {
3575                 eth = skb_gro_header_slow(skb, hlen, off);
3576                 if (unlikely(!eth)) {
3577                         napi_reuse_skb(napi, skb);
3578                         skb = NULL;
3579                         goto out;
3580                 }
3581         }
3582
3583         skb_gro_pull(skb, sizeof(*eth));
3584
3585         /*
3586          * This works because the only protocols we care about don't require
3587          * special handling.  We'll fix it up properly at the end.
3588          */
3589         skb->protocol = eth->h_proto;
3590
3591 out:
3592         return skb;
3593 }
3594 EXPORT_SYMBOL(napi_frags_skb);
3595
3596 gro_result_t napi_gro_frags(struct napi_struct *napi)
3597 {
3598         struct sk_buff *skb = napi_frags_skb(napi);
3599
3600         if (!skb)
3601                 return GRO_DROP;
3602
3603         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3604 }
3605 EXPORT_SYMBOL(napi_gro_frags);
3606
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		/* Take ownership of the whole list before re-enabling irqs. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				__smp_call_function_single(remsd->cpu,
							   &remsd->csd, 0);
			remsd = next;
		} while (remsd);

		return;
	}
#endif
	local_irq_enable();
}
3634
3635 static int process_backlog(struct napi_struct *napi, int quota)
3636 {
3637         int work = 0;
3638         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3639
3640 #ifdef CONFIG_RPS
3641         /* Check if we have pending ipi, its better to send them now,
3642          * not waiting net_rx_action() end.
3643          */
3644         if (sd->rps_ipi_list) {
3645                 local_irq_disable();
3646                 net_rps_action_and_irq_enable(sd);
3647         }
3648 #endif
3649         napi->weight = weight_p;
3650         local_irq_disable();
3651         while (work < quota) {
3652                 struct sk_buff *skb;
3653                 unsigned int qlen;
3654
3655                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3656                         local_irq_enable();
3657                         __netif_receive_skb(skb);
3658                         local_irq_disable();
3659                         input_queue_head_incr(sd);
3660                         if (++work >= quota) {
3661                                 local_irq_enable();
3662                                 return work;
3663                         }
3664                 }
3665
3666                 rps_lock(sd);
3667                 qlen = skb_queue_len(&sd->input_pkt_queue);
3668                 if (qlen)
3669                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3670                                                    &sd->process_queue);
3671
3672                 if (qlen < quota - work) {
3673                         /*
3674                          * Inline a custom version of __napi_complete().
3675                          * only current cpu owns and manipulates this napi,
3676                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3677                          * we can use a plain write instead of clear_bit(),
3678                          * and we dont need an smp_mb() memory barrier.
3679                          */
3680                         list_del(&napi->poll_list);
3681                         napi->state = 0;
3682
3683                         quota = work + qlen;
3684                 }
3685                 rps_unlock(sd);
3686         }
3687         local_irq_enable();
3688
3689         return work;
3690 }
3691
3692 /**
3693  * __napi_schedule - schedule for receive
3694  * @n: entry to schedule
3695  *
3696  * The entry's receive function will be scheduled to run
3697  */
3698 void __napi_schedule(struct napi_struct *n)
3699 {
3700         unsigned long flags;
3701
3702         local_irq_save(flags);
3703         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3704         local_irq_restore(flags);
3705 }
3706 EXPORT_SYMBOL(__napi_schedule);
3707
3708 void __napi_complete(struct napi_struct *n)
3709 {
3710         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3711         BUG_ON(n->gro_list);
3712
3713         list_del(&n->poll_list);
3714         smp_mb__before_clear_bit();
3715         clear_bit(NAPI_STATE_SCHED, &n->state);
3716 }
3717 EXPORT_SYMBOL(__napi_complete);
3718
3719 void napi_complete(struct napi_struct *n)
3720 {
3721         unsigned long flags;
3722
3723         /*
3724          * don't let napi dequeue from the cpu poll list
3725          * just in case its running on a different cpu
3726          */
3727         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3728                 return;
3729
3730         napi_gro_flush(n);
3731         local_irq_save(flags);
3732         __napi_complete(n);
3733         local_irq_restore(flags);
3734 }
3735 EXPORT_SYMBOL(napi_complete);
3736
3737 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3738                     int (*poll)(struct napi_struct *, int), int weight)
3739 {
3740         INIT_LIST_HEAD(&napi->poll_list);
3741         napi->gro_count = 0;
3742         napi->gro_list = NULL;
3743         napi->skb = NULL;
3744         napi->poll = poll;
3745         napi->weight = weight;
3746         list_add(&napi->dev_list, &dev->napi_list);
3747         napi->dev = dev;
3748 #ifdef CONFIG_NETPOLL
3749         spin_lock_init(&napi->poll_lock);
3750         napi->poll_owner = -1;
3751 #endif
3752         set_bit(NAPI_STATE_SCHED, &napi->state);
3753 }
3754 EXPORT_SYMBOL(netif_napi_add);
3755
3756 void netif_napi_del(struct napi_struct *napi)
3757 {
3758         struct sk_buff *skb, *next;
3759
3760         list_del_init(&napi->dev_list);
3761         napi_free_frags(napi);
3762
3763         for (skb = napi->gro_list; skb; skb = next) {
3764                 next = skb->next;
3765                 skb->next = NULL;
3766                 kfree_skb(skb);
3767         }
3768
3769         napi->gro_list = NULL;
3770         napi->gro_count = 0;
3771 }
3772 EXPORT_SYMBOL(netif_napi_del);
3773
3774 static void net_rx_action(struct softirq_action *h)
3775 {
3776         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3777         unsigned long time_limit = jiffies + 2;
3778         int budget = netdev_budget;
3779         void *have;
3780
3781         local_irq_disable();
3782
3783         while (!list_empty(&sd->poll_list)) {
3784                 struct napi_struct *n;
3785                 int work, weight;
3786
3787                 /* If softirq window is exhuasted then punt.
3788                  * Allow this to run for 2 jiffies since which will allow
3789                  * an average latency of 1.5/HZ.
3790                  */
3791                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3792                         goto softnet_break;
3793
3794                 local_irq_enable();
3795
3796                 /* Even though interrupts have been re-enabled, this
3797                  * access is safe because interrupts can only add new
3798                  * entries to the tail of this list, and only ->poll()
3799                  * calls can remove this head entry from the list.
3800                  */
3801                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3802
3803                 have = netpoll_poll_lock(n);
3804
3805                 weight = n->weight;
3806
3807                 /* This NAPI_STATE_SCHED test is for avoiding a race
3808                  * with netpoll's poll_napi().  Only the entity which
3809                  * obtains the lock and sees NAPI_STATE_SCHED set will
3810                  * actually make the ->poll() call.  Therefore we avoid
3811                  * accidentally calling ->poll() when NAPI is not scheduled.
3812                  */
3813                 work = 0;
3814                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3815                         work = n->poll(n, weight);
3816                         trace_napi_poll(n);
3817                 }
3818
3819                 WARN_ON_ONCE(work > weight);
3820
3821                 budget -= work;
3822
3823                 local_irq_disable();
3824
3825                 /* Drivers must not modify the NAPI state if they
3826                  * consume the entire weight.  In such cases this code
3827                  * still "owns" the NAPI instance and therefore can
3828                  * move the instance around on the list at-will.
3829                  */
3830                 if (unlikely(work == weight)) {
3831                         if (unlikely(napi_disable_pending(n))) {
3832                                 local_irq_enable();
3833                                 napi_complete(n);
3834                                 local_irq_disable();
3835                         } else
3836                                 list_move_tail(&n->poll_list, &sd->poll_list);
3837                 }
3838
3839                 netpoll_poll_unlock(have);
3840         }
3841 out:
3842         net_rps_action_and_irq_enable(sd);
3843
3844 #ifdef CONFIG_NET_DMA
3845         /*
3846          * There may not be any more sk_buffs coming right now, so push
3847          * any pending DMA copies to hardware
3848          */
3849         dma_issue_pending_all();
3850 #endif
3851
3852         return;
3853
3854 softnet_break:
3855         sd->time_squeeze++;
3856         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3857         goto out;
3858 }
3859
3860 static gifconf_func_t *gifconf_list[NPROTO];
3861
3862 /**
3863  *      register_gifconf        -       register a SIOCGIF handler
3864  *      @family: Address family
3865  *      @gifconf: Function handler
3866  *
3867  *      Register protocol dependent address dumping routines. The handler
3868  *      that is passed must not be freed or reused until it has been replaced
3869  *      by another handler.
3870  */
3871 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3872 {
3873         if (family >= NPROTO)
3874                 return -EINVAL;
3875         gifconf_list[family] = gifconf;
3876         return 0;
3877 }
3878 EXPORT_SYMBOL(register_gifconf);
3879
3880
3881 /*
3882  *      Map an interface index to its name (SIOCGIFNAME)
3883  */
3884
3885 /*
3886  *      We need this ioctl for efficient implementation of the
3887  *      if_indextoname() function required by the IPv6 API.  Without
3888  *      it, we would have to search all the interfaces to find a
3889  *      match.  --pb
3890  */
3891
3892 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3893 {
3894         struct net_device *dev;
3895         struct ifreq ifr;
3896
3897         /*
3898          *      Fetch the caller's info block.
3899          */
3900
3901         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3902                 return -EFAULT;
3903
3904         rcu_read_lock();
3905         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3906         if (!dev) {
3907                 rcu_read_unlock();
3908                 return -ENODEV;
3909         }
3910
3911         strcpy(ifr.ifr_name, dev->name);
3912         rcu_read_unlock();
3913
3914         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3915                 return -EFAULT;
3916         return 0;
3917 }
3918
3919 /*
3920  *      Perform a SIOCGIFCONF call. This structure will change
3921  *      size eventually, and there is nothing I can do about it.
3922  *      Thus we will need a 'compatibility mode'.
3923  */
3924
3925 static int dev_ifconf(struct net *net, char __user *arg)
3926 {
3927         struct ifconf ifc;
3928         struct net_device *dev;
3929         char __user *pos;
3930         int len;
3931         int total;
3932         int i;
3933
3934         /*
3935          *      Fetch the caller's info block.
3936          */
3937
3938         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3939                 return -EFAULT;
3940
3941         pos = ifc.ifc_buf;
3942         len = ifc.ifc_len;
3943
3944         /*
3945          *      Loop over the interfaces, and write an info block for each.
3946          */
3947
3948         total = 0;
3949         for_each_netdev(net, dev) {
3950                 for (i = 0; i < NPROTO; i++) {
3951                         if (gifconf_list[i]) {
3952                                 int done;
3953                                 if (!pos)
3954                                         done = gifconf_list[i](dev, NULL, 0);
3955                                 else
3956                                         done = gifconf_list[i](dev, pos + total,
3957                                                                len - total);
3958                                 if (done < 0)
3959                                         return -EFAULT;
3960                                 total += done;
3961                         }
3962                 }
3963         }
3964
3965         /*
3966          *      All done.  Write the updated control block back to the caller.
3967          */
3968         ifc.ifc_len = total;
3969
3970         /*
3971          *      Both BSD and Solaris return 0 here, so we do too.
3972          */
3973         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3974 }
3975
3976 #ifdef CONFIG_PROC_FS
3977 /*
3978  *      This is invoked by the /proc filesystem handler to display a device
3979  *      in detail.
3980  */
3981 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3982         __acquires(RCU)
3983 {
3984         struct net *net = seq_file_net(seq);
3985         loff_t off;
3986         struct net_device *dev;
3987
3988         rcu_read_lock();
3989         if (!*pos)
3990                 return SEQ_START_TOKEN;
3991
3992         off = 1;
3993         for_each_netdev_rcu(net, dev)
3994                 if (off++ == *pos)
3995                         return dev;
3996
3997         return NULL;
3998 }
3999
4000 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4001 {
4002         struct net_device *dev = v;
4003
4004         if (v == SEQ_START_TOKEN)
4005                 dev = first_net_device_rcu(seq_file_net(seq));
4006         else
4007                 dev = next_net_device_rcu(dev);
4008
4009         ++*pos;
4010         return dev;
4011 }
4012
4013 void dev_seq_stop(struct seq_file *seq, void *v)
4014         __releases(RCU)
4015 {
4016         rcu_read_unlock();
4017 }
4018
4019 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4020 {
4021         struct rtnl_link_stats64 temp;
4022         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4023
4024         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4025                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4026                    dev->name, stats->rx_bytes, stats->rx_packets,
4027                    stats->rx_errors,
4028                    stats->rx_dropped + stats->rx_missed_errors,
4029                    stats->rx_fifo_errors,
4030                    stats->rx_length_errors + stats->rx_over_errors +
4031                     stats->rx_crc_errors + stats->rx_frame_errors,
4032                    stats->rx_compressed, stats->multicast,
4033                    stats->tx_bytes, stats->tx_packets,
4034                    stats->tx_errors, stats->tx_dropped,
4035                    stats->tx_fifo_errors, stats->collisions,
4036                    stats->tx_carrier_errors +
4037                     stats->tx_aborted_errors +
4038                     stats->tx_window_errors +
4039                     stats->tx_heartbeat_errors,
4040                    stats->tx_compressed);
4041 }
4042
4043 /*
4044  *      Called from the PROCfs module. This now uses the new arbitrary sized
4045  *      /proc/net interface to create /proc/net/dev
4046  */
4047 static int dev_seq_show(struct seq_file *seq, void *v)
4048 {
4049         if (v == SEQ_START_TOKEN)
4050                 seq_puts(seq, "Inter-|   Receive                            "
4051                               "                    |  Transmit\n"
4052                               " face |bytes    packets errs drop fifo frame "
4053                               "compressed multicast|bytes    packets errs "
4054                               "drop fifo colls carrier compressed\n");
4055         else
4056                 dev_seq_printf_stats(seq, v);
4057         return 0;
4058 }
4059
4060 static struct softnet_data *softnet_get_online(loff_t *pos)
4061 {
4062         struct softnet_data *sd = NULL;
4063
4064         while (*pos < nr_cpu_ids)
4065                 if (cpu_online(*pos)) {
4066                         sd = &per_cpu(softnet_data, *pos);
4067                         break;
4068                 } else
4069                         ++*pos;
4070         return sd;
4071 }
4072
4073 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4074 {
4075         return softnet_get_online(pos);
4076 }
4077
4078 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4079 {
4080         ++*pos;
4081         return softnet_get_online(pos);
4082 }
4083
/* seq_file ->stop: the start callback takes nothing that needs releasing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4087
4088 static int softnet_seq_show(struct seq_file *seq, void *v)
4089 {
4090         struct softnet_data *sd = v;
4091
4092         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4093                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4094                    0, 0, 0, 0, /* was fastroute */
4095                    sd->cpu_collision, sd->received_rps);
4096         return 0;
4097 }
4098
4099 static const struct seq_operations dev_seq_ops = {
4100         .start = dev_seq_start,
4101         .next  = dev_seq_next,
4102         .stop  = dev_seq_stop,
4103         .show  = dev_seq_show,
4104 };
4105
4106 static int dev_seq_open(struct inode *inode, struct file *file)
4107 {
4108         return seq_open_net(inode, file, &dev_seq_ops,
4109                             sizeof(struct seq_net_private));
4110 }
4111
4112 static const struct file_operations dev_seq_fops = {
4113         .owner   = THIS_MODULE,
4114         .open    = dev_seq_open,
4115         .read    = seq_read,
4116         .llseek  = seq_lseek,
4117         .release = seq_release_net,
4118 };
4119
4120 static const struct seq_operations softnet_seq_ops = {
4121         .start = softnet_seq_start,
4122         .next  = softnet_seq_next,
4123         .stop  = softnet_seq_stop,
4124         .show  = softnet_seq_show,
4125 };
4126
4127 static int softnet_seq_open(struct inode *inode, struct file *file)
4128 {
4129         return seq_open(file, &softnet_seq_ops);
4130 }
4131
4132 static const struct file_operations softnet_seq_fops = {
4133         .owner   = THIS_MODULE,
4134         .open    = softnet_seq_open,
4135         .read    = seq_read,
4136         .llseek  = seq_lseek,
4137         .release = seq_release,
4138 };
4139
4140 static void *ptype_get_idx(loff_t pos)
4141 {
4142         struct packet_type *pt = NULL;
4143         loff_t i = 0;
4144         int t;
4145
4146         list_for_each_entry_rcu(pt, &ptype_all, list) {
4147                 if (i == pos)
4148                         return pt;
4149                 ++i;
4150         }
4151
4152         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4153                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4154                         if (i == pos)
4155                                 return pt;
4156                         ++i;
4157                 }
4158         }
4159         return NULL;
4160 }
4161
4162 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4163         __acquires(RCU)
4164 {
4165         rcu_read_lock();
4166         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4167 }
4168
4169 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4170 {
4171         struct packet_type *pt;
4172         struct list_head *nxt;
4173         int hash;
4174
4175         ++*pos;
4176         if (v == SEQ_START_TOKEN)
4177                 return ptype_get_idx(0);
4178
4179         pt = v;
4180         nxt = pt->list.next;
4181         if (pt->type == htons(ETH_P_ALL)) {
4182                 if (nxt != &ptype_all)
4183                         goto found;
4184                 hash = 0;
4185                 nxt = ptype_base[0].next;
4186         } else
4187                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4188
4189         while (nxt == &ptype_base[hash]) {
4190                 if (++hash >= PTYPE_HASH_SIZE)
4191                         return NULL;
4192                 nxt = ptype_base[hash].next;
4193         }
4194 found:
4195         return list_entry(nxt, struct packet_type, list);
4196 }
4197
4198 static void ptype_seq_stop(struct seq_file *seq, void *v)
4199         __releases(RCU)
4200 {
4201         rcu_read_unlock();
4202 }
4203
4204 static int ptype_seq_show(struct seq_file *seq, void *v)
4205 {
4206         struct packet_type *pt = v;
4207
4208         if (v == SEQ_START_TOKEN)
4209                 seq_puts(seq, "Type Device      Function\n");
4210         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4211                 if (pt->type == htons(ETH_P_ALL))
4212                         seq_puts(seq, "ALL ");
4213                 else
4214                         seq_printf(seq, "%04x", ntohs(pt->type));
4215
4216                 seq_printf(seq, " %-8s %pF\n",
4217                            pt->dev ? pt->dev->name : "", pt->func);
4218         }
4219
4220         return 0;
4221 }
4222
4223 static const struct seq_operations ptype_seq_ops = {
4224         .start = ptype_seq_start,
4225         .next  = ptype_seq_next,
4226         .stop  = ptype_seq_stop,
4227         .show  = ptype_seq_show,
4228 };
4229
4230 static int ptype_seq_open(struct inode *inode, struct file *file)
4231 {
4232         return seq_open_net(inode, file, &ptype_seq_ops,
4233                         sizeof(struct seq_net_private));
4234 }
4235
4236 static const struct file_operations ptype_seq_fops = {
4237         .owner   = THIS_MODULE,
4238         .open    = ptype_seq_open,
4239         .read    = seq_read,
4240         .llseek  = seq_lseek,
4241         .release = seq_release_net,
4242 };
4243
4244
4245 static int __net_init dev_proc_net_init(struct net *net)
4246 {
4247         int rc = -ENOMEM;
4248
4249         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4250                 goto out;
4251         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4252                 goto out_dev;
4253         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4254                 goto out_softnet;
4255
4256         if (wext_proc_init(net))
4257                 goto out_ptype;
4258         rc = 0;
4259 out:
4260         return rc;
4261 out_ptype:
4262         proc_net_remove(net, "ptype");
4263 out_softnet:
4264         proc_net_remove(net, "softnet_stat");
4265 out_dev:
4266         proc_net_remove(net, "dev");
4267         goto out;
4268 }
4269
4270 static void __net_exit dev_proc_net_exit(struct net *net)
4271 {
4272         wext_proc_exit(net);
4273
4274         proc_net_remove(net, "ptype");
4275         proc_net_remove(net, "softnet_stat");
4276         proc_net_remove(net, "dev");
4277 }
4278
4279 static struct pernet_operations __net_initdata dev_proc_ops = {
4280         .init = dev_proc_net_init,
4281         .exit = dev_proc_net_exit,
4282 };
4283
4284 static int __init dev_proc_init(void)
4285 {
4286         return register_pernet_subsys(&dev_proc_ops);
4287 }
4288 #else
4289 #define dev_proc_init() 0
4290 #endif  /* CONFIG_PROC_FS */
4291
4292
4293 /**
4294  *      netdev_set_master       -       set up master pointer
4295  *      @slave: slave device
4296  *      @master: new master device
4297  *
4298  *      Changes the master device of the slave. Pass %NULL to break the
4299  *      bonding. The caller must hold the RTNL semaphore. On a failure
4300  *      a negative errno code is returned. On success the reference counts
4301  *      are adjusted and the function returns zero.
4302  */
4303 int netdev_set_master(struct net_device *slave, struct net_device *master)
4304 {
4305         struct net_device *old = slave->master;
4306
4307         ASSERT_RTNL();
4308
4309         if (master) {
4310                 if (old)
4311                         return -EBUSY;
4312                 dev_hold(master);
4313         }
4314
4315         slave->master = master;
4316
4317         if (old)
4318                 dev_put(old);
4319         return 0;
4320 }
4321 EXPORT_SYMBOL(netdev_set_master);
4322
4323 /**
4324  *      netdev_set_bond_master  -       set up bonding master/slave pair
4325  *      @slave: slave device
4326  *      @master: new master device
4327  *
4328  *      Changes the master device of the slave. Pass %NULL to break the
4329  *      bonding. The caller must hold the RTNL semaphore. On a failure
4330  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4331  *      to the routing socket and the function returns zero.
4332  */
4333 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4334 {
4335         int err;
4336
4337         ASSERT_RTNL();
4338
4339         err = netdev_set_master(slave, master);
4340         if (err)
4341                 return err;
4342         if (master)
4343                 slave->flags |= IFF_SLAVE;
4344         else
4345                 slave->flags &= ~IFF_SLAVE;
4346
4347         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4348         return 0;
4349 }
4350 EXPORT_SYMBOL(netdev_set_bond_master);
4351
4352 static void dev_change_rx_flags(struct net_device *dev, int flags)
4353 {
4354         const struct net_device_ops *ops = dev->netdev_ops;
4355
4356         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4357                 ops->ndo_change_rx_flags(dev, flags);
4358 }
4359
4360 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4361 {
4362         unsigned short old_flags = dev->flags;
4363         uid_t uid;
4364         gid_t gid;
4365
4366         ASSERT_RTNL();
4367
4368         dev->flags |= IFF_PROMISC;
4369         dev->promiscuity += inc;
4370         if (dev->promiscuity == 0) {
4371                 /*
4372                  * Avoid overflow.
4373                  * If inc causes overflow, untouch promisc and return error.
4374                  */
4375                 if (inc < 0)
4376                         dev->flags &= ~IFF_PROMISC;
4377                 else {
4378                         dev->promiscuity -= inc;
4379                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4380                                 "set promiscuity failed, promiscuity feature "
4381                                 "of device might be broken.\n", dev->name);
4382                         return -EOVERFLOW;
4383                 }
4384         }
4385         if (dev->flags != old_flags) {
4386                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4387                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4388                                                                "left");
4389                 if (audit_enabled) {
4390                         current_uid_gid(&uid, &gid);
4391                         audit_log(current->audit_context, GFP_ATOMIC,
4392                                 AUDIT_ANOM_PROMISCUOUS,
4393                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4394                                 dev->name, (dev->flags & IFF_PROMISC),
4395                                 (old_flags & IFF_PROMISC),
4396                                 audit_get_loginuid(current),
4397                                 uid, gid,
4398                                 audit_get_sessionid(current));
4399                 }
4400
4401                 dev_change_rx_flags(dev, IFF_PROMISC);
4402         }
4403         return 0;
4404 }
4405
4406 /**
4407  *      dev_set_promiscuity     - update promiscuity count on a device
4408  *      @dev: device
4409  *      @inc: modifier
4410  *
4411  *      Add or remove promiscuity from a device. While the count in the device
4412  *      remains above zero the interface remains promiscuous. Once it hits zero
4413  *      the device reverts back to normal filtering operation. A negative inc
4414  *      value is used to drop promiscuity on the device.
4415  *      Return 0 if successful or a negative errno code on error.
4416  */
4417 int dev_set_promiscuity(struct net_device *dev, int inc)
4418 {
4419         unsigned short old_flags = dev->flags;
4420         int err;
4421
4422         err = __dev_set_promiscuity(dev, inc);
4423         if (err < 0)
4424                 return err;
4425         if (dev->flags != old_flags)
4426                 dev_set_rx_mode(dev);
4427         return err;
4428 }
4429 EXPORT_SYMBOL(dev_set_promiscuity);
4430
4431 /**
4432  *      dev_set_allmulti        - update allmulti count on a device
4433  *      @dev: device
4434  *      @inc: modifier
4435  *
4436  *      Add or remove reception of all multicast frames to a device. While the
4437  *      count in the device remains above zero the interface remains listening
4438  *      to all interfaces. Once it hits zero the device reverts back to normal
4439  *      filtering operation. A negative @inc value is used to drop the counter
4440  *      when releasing a resource needing all multicasts.
4441  *      Return 0 if successful or a negative errno code on error.
4442  */
4443
4444 int dev_set_allmulti(struct net_device *dev, int inc)
4445 {
4446         unsigned short old_flags = dev->flags;
4447
4448         ASSERT_RTNL();
4449
4450         dev->flags |= IFF_ALLMULTI;
4451         dev->allmulti += inc;
4452         if (dev->allmulti == 0) {
4453                 /*
4454                  * Avoid overflow.
4455                  * If inc causes overflow, untouch allmulti and return error.
4456                  */
4457                 if (inc < 0)
4458                         dev->flags &= ~IFF_ALLMULTI;
4459                 else {
4460                         dev->allmulti -= inc;
4461                         printk(KERN_WARNING "%s: allmulti touches roof, "
4462                                 "set allmulti failed, allmulti feature of "
4463                                 "device might be broken.\n", dev->name);
4464                         return -EOVERFLOW;
4465                 }
4466         }
4467         if (dev->flags ^ old_flags) {
4468                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4469                 dev_set_rx_mode(dev);
4470         }
4471         return 0;
4472 }
4473 EXPORT_SYMBOL(dev_set_allmulti);
4474
4475 /*
4476  *      Upload unicast and multicast address lists to device and
4477  *      configure RX filtering. When the device doesn't support unicast
4478  *      filtering it is put in promiscuous mode while unicast addresses
4479  *      are present.
4480  */
4481 void __dev_set_rx_mode(struct net_device *dev)
4482 {
4483         const struct net_device_ops *ops = dev->netdev_ops;
4484
4485         /* dev_open will call this function so the list will stay sane. */
4486         if (!(dev->flags&IFF_UP))
4487                 return;
4488
4489         if (!netif_device_present(dev))
4490                 return;
4491
4492         if (ops->ndo_set_rx_mode)
4493                 ops->ndo_set_rx_mode(dev);
4494         else {
4495                 /* Unicast addresses changes may only happen under the rtnl,
4496                  * therefore calling __dev_set_promiscuity here is safe.
4497                  */
4498                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4499                         __dev_set_promiscuity(dev, 1);
4500                         dev->uc_promisc = true;
4501                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4502                         __dev_set_promiscuity(dev, -1);
4503                         dev->uc_promisc = false;
4504                 }
4505
4506                 if (ops->ndo_set_multicast_list)
4507                         ops->ndo_set_multicast_list(dev);
4508         }
4509 }
4510
/*
 *      Locked front end for __dev_set_rx_mode(): the RX filter update runs
 *      between netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
4517
4518 /**
4519  *      dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4520  *      @dev: device
4521  *      @cmd: memory area for ethtool_ops::get_settings() result
4522  *
4523  *      The cmd arg is initialized properly (cleared and
4524  *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4525  *
4526  *      Return device's ethtool_ops::get_settings() result value or
4527  *      -EOPNOTSUPP when device doesn't expose
4528  *      ethtool_ops::get_settings() operation.
4529  */
4530 int dev_ethtool_get_settings(struct net_device *dev,
4531                              struct ethtool_cmd *cmd)
4532 {
4533         if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4534                 return -EOPNOTSUPP;
4535
4536         memset(cmd, 0, sizeof(struct ethtool_cmd));
4537         cmd->cmd = ETHTOOL_GSET;
4538         return dev->ethtool_ops->get_settings(dev, cmd);
4539 }
4540 EXPORT_SYMBOL(dev_ethtool_get_settings);
4541
4542 /**
4543  *      dev_get_flags - get flags reported to userspace
4544  *      @dev: device
4545  *
4546  *      Get the combination of flag bits exported through APIs to userspace.
4547  */
4548 unsigned dev_get_flags(const struct net_device *dev)
4549 {
4550         unsigned flags;
4551
4552         flags = (dev->flags & ~(IFF_PROMISC |
4553                                 IFF_ALLMULTI |
4554                                 IFF_RUNNING |
4555                                 IFF_LOWER_UP |
4556                                 IFF_DORMANT)) |
4557                 (dev->gflags & (IFF_PROMISC |
4558                                 IFF_ALLMULTI));
4559
4560         if (netif_running(dev)) {
4561                 if (netif_oper_up(dev))
4562                         flags |= IFF_RUNNING;
4563                 if (netif_carrier_ok(dev))
4564                         flags |= IFF_LOWER_UP;
4565                 if (netif_dormant(dev))
4566                         flags |= IFF_DORMANT;
4567         }
4568
4569         return flags;
4570 }
4571 EXPORT_SYMBOL(dev_get_flags);
4572
4573 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4574 {
4575         int old_flags = dev->flags;
4576         int ret;
4577
4578         ASSERT_RTNL();
4579
4580         /*
4581          *      Set the flags on our device.
4582          */
4583
4584         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4585                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4586                                IFF_AUTOMEDIA)) |
4587                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4588                                     IFF_ALLMULTI));
4589
4590         /*
4591          *      Load in the correct multicast list now the flags have changed.
4592          */
4593
4594         if ((old_flags ^ flags) & IFF_MULTICAST)
4595                 dev_change_rx_flags(dev, IFF_MULTICAST);
4596
4597         dev_set_rx_mode(dev);
4598
4599         /*
4600          *      Have we downed the interface. We handle IFF_UP ourselves
4601          *      according to user attempts to set it, rather than blindly
4602          *      setting it.
4603          */
4604
4605         ret = 0;
4606         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4607                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4608
4609                 if (!ret)
4610                         dev_set_rx_mode(dev);
4611         }
4612
4613         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4614                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4615
4616                 dev->gflags ^= IFF_PROMISC;
4617                 dev_set_promiscuity(dev, inc);
4618         }
4619
4620         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4621            is important. Some (broken) drivers set IFF_PROMISC, when
4622            IFF_ALLMULTI is requested not asking us and not reporting.
4623          */
4624         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4625                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4626
4627                 dev->gflags ^= IFF_ALLMULTI;
4628                 dev_set_allmulti(dev, inc);
4629         }
4630
4631         return ret;
4632 }
4633
4634 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4635 {
4636         unsigned int changes = dev->flags ^ old_flags;
4637
4638         if (changes & IFF_UP) {
4639                 if (dev->flags & IFF_UP)
4640                         call_netdevice_notifiers(NETDEV_UP, dev);
4641                 else
4642                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4643         }
4644
4645         if (dev->flags & IFF_UP &&
4646             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4647                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4648 }
4649
4650 /**
4651  *      dev_change_flags - change device settings
4652  *      @dev: device
4653  *      @flags: device state flags
4654  *
4655  *      Change settings on device based state flags. The flags are
4656  *      in the userspace exported format.
4657  */
4658 int dev_change_flags(struct net_device *dev, unsigned flags)
4659 {
4660         int ret, changes;
4661         int old_flags = dev->flags;
4662
4663         ret = __dev_change_flags(dev, flags);
4664         if (ret < 0)
4665                 return ret;
4666
4667         changes = old_flags ^ dev->flags;
4668         if (changes)
4669                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4670
4671         __dev_notify_flags(dev, old_flags);
4672         return ret;
4673 }
4674 EXPORT_SYMBOL(dev_change_flags);
4675
4676 /**
4677  *      dev_set_mtu - Change maximum transfer unit
4678  *      @dev: device
4679  *      @new_mtu: new transfer unit
4680  *
4681  *      Change the maximum transfer size of the network device.
4682  */
4683 int dev_set_mtu(struct net_device *dev, int new_mtu)
4684 {
4685         const struct net_device_ops *ops = dev->netdev_ops;
4686         int err;
4687
4688         if (new_mtu == dev->mtu)
4689                 return 0;
4690
4691         /*      MTU must be positive.    */
4692         if (new_mtu < 0)
4693                 return -EINVAL;
4694
4695         if (!netif_device_present(dev))
4696                 return -ENODEV;
4697
4698         err = 0;
4699         if (ops->ndo_change_mtu)
4700                 err = ops->ndo_change_mtu(dev, new_mtu);
4701         else
4702                 dev->mtu = new_mtu;
4703
4704         if (!err && dev->flags & IFF_UP)
4705                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4706         return err;
4707 }
4708 EXPORT_SYMBOL(dev_set_mtu);
4709
4710 /**
4711  *      dev_set_group - Change group this device belongs to
4712  *      @dev: device
4713  *      @new_group: group this device should belong to
4714  */
4715 void dev_set_group(struct net_device *dev, int new_group)
4716 {
4717         dev->group = new_group;
4718 }
4719 EXPORT_SYMBOL(dev_set_group);
4720
4721 /**
4722  *      dev_set_mac_address - Change Media Access Control Address
4723  *      @dev: device
4724  *      @sa: new address
4725  *
4726  *      Change the hardware (MAC) address of the device
4727  */
4728 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4729 {
4730         const struct net_device_ops *ops = dev->netdev_ops;
4731         int err;
4732
4733         if (!ops->ndo_set_mac_address)
4734                 return -EOPNOTSUPP;
4735         if (sa->sa_family != dev->type)
4736                 return -EINVAL;
4737         if (!netif_device_present(dev))
4738                 return -ENODEV;
4739         err = ops->ndo_set_mac_address(dev, sa);
4740         if (!err)
4741                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4742         return err;
4743 }
4744 EXPORT_SYMBOL(dev_set_mac_address);
4745
4746 /*
4747  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4748  */
4749 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4750 {
4751         int err;
4752         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4753
4754         if (!dev)
4755                 return -ENODEV;
4756
4757         switch (cmd) {
4758         case SIOCGIFFLAGS:      /* Get interface flags */
4759                 ifr->ifr_flags = (short) dev_get_flags(dev);
4760                 return 0;
4761
4762         case SIOCGIFMETRIC:     /* Get the metric on the interface
4763                                    (currently unused) */
4764                 ifr->ifr_metric = 0;
4765                 return 0;
4766
4767         case SIOCGIFMTU:        /* Get the MTU of a device */
4768                 ifr->ifr_mtu = dev->mtu;
4769                 return 0;
4770
4771         case SIOCGIFHWADDR:
4772                 if (!dev->addr_len)
4773                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4774                 else
4775                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4776                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4777                 ifr->ifr_hwaddr.sa_family = dev->type;
4778                 return 0;
4779
4780         case SIOCGIFSLAVE:
4781                 err = -EINVAL;
4782                 break;
4783
4784         case SIOCGIFMAP:
4785                 ifr->ifr_map.mem_start = dev->mem_start;
4786                 ifr->ifr_map.mem_end   = dev->mem_end;
4787                 ifr->ifr_map.base_addr = dev->base_addr;
4788                 ifr->ifr_map.irq       = dev->irq;
4789                 ifr->ifr_map.dma       = dev->dma;
4790                 ifr->ifr_map.port      = dev->if_port;
4791                 return 0;
4792
4793         case SIOCGIFINDEX:
4794                 ifr->ifr_ifindex = dev->ifindex;
4795                 return 0;
4796
4797         case SIOCGIFTXQLEN:
4798                 ifr->ifr_qlen = dev->tx_queue_len;
4799                 return 0;
4800
4801         default:
4802                 /* dev_ioctl() should ensure this case
4803                  * is never reached
4804                  */
4805                 WARN_ON(1);
4806                 err = -ENOTTY;
4807                 break;
4808
4809         }
4810         return err;
4811 }
4812
4813 /*
4814  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4815  */
4816 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4817 {
4818         int err;
4819         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4820         const struct net_device_ops *ops;
4821
4822         if (!dev)
4823                 return -ENODEV;
4824
4825         ops = dev->netdev_ops;
4826
4827         switch (cmd) {
4828         case SIOCSIFFLAGS:      /* Set interface flags */
4829                 return dev_change_flags(dev, ifr->ifr_flags);
4830
4831         case SIOCSIFMETRIC:     /* Set the metric on the interface
4832                                    (currently unused) */
4833                 return -EOPNOTSUPP;
4834
4835         case SIOCSIFMTU:        /* Set the MTU of a device */
4836                 return dev_set_mtu(dev, ifr->ifr_mtu);
4837
4838         case SIOCSIFHWADDR:
4839                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4840
4841         case SIOCSIFHWBROADCAST:
4842                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4843                         return -EINVAL;
4844                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4845                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4846                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4847                 return 0;
4848
4849         case SIOCSIFMAP:
4850                 if (ops->ndo_set_config) {
4851                         if (!netif_device_present(dev))
4852                                 return -ENODEV;
4853                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4854                 }
4855                 return -EOPNOTSUPP;
4856
4857         case SIOCADDMULTI:
4858                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4859                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4860                         return -EINVAL;
4861                 if (!netif_device_present(dev))
4862                         return -ENODEV;
4863                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4864
4865         case SIOCDELMULTI:
4866                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868                         return -EINVAL;
4869                 if (!netif_device_present(dev))
4870                         return -ENODEV;
4871                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4872
4873         case SIOCSIFTXQLEN:
4874                 if (ifr->ifr_qlen < 0)
4875                         return -EINVAL;
4876                 dev->tx_queue_len = ifr->ifr_qlen;
4877                 return 0;
4878
4879         case SIOCSIFNAME:
4880                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4881                 return dev_change_name(dev, ifr->ifr_newname);
4882
4883         /*
4884          *      Unknown or private ioctl
4885          */
4886         default:
4887                 if ((cmd >= SIOCDEVPRIVATE &&
4888                     cmd <= SIOCDEVPRIVATE + 15) ||
4889                     cmd == SIOCBONDENSLAVE ||
4890                     cmd == SIOCBONDRELEASE ||
4891                     cmd == SIOCBONDSETHWADDR ||
4892                     cmd == SIOCBONDSLAVEINFOQUERY ||
4893                     cmd == SIOCBONDINFOQUERY ||
4894                     cmd == SIOCBONDCHANGEACTIVE ||
4895                     cmd == SIOCGMIIPHY ||
4896                     cmd == SIOCGMIIREG ||
4897                     cmd == SIOCSMIIREG ||
4898                     cmd == SIOCBRADDIF ||
4899                     cmd == SIOCBRDELIF ||
4900                     cmd == SIOCSHWTSTAMP ||
4901                     cmd == SIOCWANDEV) {
4902                         err = -EOPNOTSUPP;
4903                         if (ops->ndo_do_ioctl) {
4904                                 if (netif_device_present(dev))
4905                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4906                                 else
4907                                         err = -ENODEV;
4908                         }
4909                 } else
4910                         err = -EINVAL;
4911
4912         }
4913         return err;
4914 }
4915
4916 /*
4917  *      This function handles all "interface"-type I/O control requests. The actual
4918  *      'doing' part of this is dev_ifsioc above.
4919  */
4920
4921 /**
4922  *      dev_ioctl       -       network device ioctl
4923  *      @net: the applicable net namespace
4924  *      @cmd: command to issue
4925  *      @arg: pointer to a struct ifreq in user space
4926  *
4927  *      Issue ioctl functions to devices. This is normally called by the
4928  *      user space syscall interfaces but can sometimes be useful for
4929  *      other purposes. The return value is the return from the syscall if
4930  *      positive or a negative errno code on error.
4931  */
4932
4933 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4934 {
4935         struct ifreq ifr;
4936         int ret;
4937         char *colon;
4938
4939         /* One special case: SIOCGIFCONF takes ifconf argument
4940            and requires shared lock, because it sleeps writing
4941            to user space.
4942          */
4943
4944         if (cmd == SIOCGIFCONF) {
4945                 rtnl_lock();
4946                 ret = dev_ifconf(net, (char __user *) arg);
4947                 rtnl_unlock();
4948                 return ret;
4949         }
4950         if (cmd == SIOCGIFNAME)
4951                 return dev_ifname(net, (struct ifreq __user *)arg);
4952
4953         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4954                 return -EFAULT;
4955
4956         ifr.ifr_name[IFNAMSIZ-1] = 0;
4957
4958         colon = strchr(ifr.ifr_name, ':');
4959         if (colon)
4960                 *colon = 0;
4961
4962         /*
4963          *      See which interface the caller is talking about.
4964          */
4965
4966         switch (cmd) {
4967         /*
4968          *      These ioctl calls:
4969          *      - can be done by all.
4970          *      - atomic and do not require locking.
4971          *      - return a value
4972          */
4973         case SIOCGIFFLAGS:
4974         case SIOCGIFMETRIC:
4975         case SIOCGIFMTU:
4976         case SIOCGIFHWADDR:
4977         case SIOCGIFSLAVE:
4978         case SIOCGIFMAP:
4979         case SIOCGIFINDEX:
4980         case SIOCGIFTXQLEN:
4981                 dev_load(net, ifr.ifr_name);
4982                 rcu_read_lock();
4983                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4984                 rcu_read_unlock();
4985                 if (!ret) {
4986                         if (colon)
4987                                 *colon = ':';
4988                         if (copy_to_user(arg, &ifr,
4989                                          sizeof(struct ifreq)))
4990                                 ret = -EFAULT;
4991                 }
4992                 return ret;
4993
4994         case SIOCETHTOOL:
4995                 dev_load(net, ifr.ifr_name);
4996                 rtnl_lock();
4997                 ret = dev_ethtool(net, &ifr);
4998                 rtnl_unlock();
4999                 if (!ret) {
5000                         if (colon)
5001                                 *colon = ':';
5002                         if (copy_to_user(arg, &ifr,
5003                                          sizeof(struct ifreq)))
5004                                 ret = -EFAULT;
5005                 }
5006                 return ret;
5007
5008         /*
5009          *      These ioctl calls:
5010          *      - require superuser power.
5011          *      - require strict serialization.
5012          *      - return a value
5013          */
5014         case SIOCGMIIPHY:
5015         case SIOCGMIIREG:
5016         case SIOCSIFNAME:
5017                 if (!capable(CAP_NET_ADMIN))
5018                         return -EPERM;
5019                 dev_load(net, ifr.ifr_name);
5020                 rtnl_lock();
5021                 ret = dev_ifsioc(net, &ifr, cmd);
5022                 rtnl_unlock();
5023                 if (!ret) {
5024                         if (colon)
5025                                 *colon = ':';
5026                         if (copy_to_user(arg, &ifr,
5027                                          sizeof(struct ifreq)))
5028                                 ret = -EFAULT;
5029                 }
5030                 return ret;
5031
5032         /*
5033          *      These ioctl calls:
5034          *      - require superuser power.
5035          *      - require strict serialization.
5036          *      - do not return a value
5037          */
5038         case SIOCSIFFLAGS:
5039         case SIOCSIFMETRIC:
5040         case SIOCSIFMTU:
5041         case SIOCSIFMAP:
5042         case SIOCSIFHWADDR:
5043         case SIOCSIFSLAVE:
5044         case SIOCADDMULTI:
5045         case SIOCDELMULTI:
5046         case SIOCSIFHWBROADCAST:
5047         case SIOCSIFTXQLEN:
5048         case SIOCSMIIREG:
5049         case SIOCBONDENSLAVE:
5050         case SIOCBONDRELEASE:
5051         case SIOCBONDSETHWADDR:
5052         case SIOCBONDCHANGEACTIVE:
5053         case SIOCBRADDIF:
5054         case SIOCBRDELIF:
5055         case SIOCSHWTSTAMP:
5056                 if (!capable(CAP_NET_ADMIN))
5057                         return -EPERM;
5058                 /* fall through */
5059         case SIOCBONDSLAVEINFOQUERY:
5060         case SIOCBONDINFOQUERY:
5061                 dev_load(net, ifr.ifr_name);
5062                 rtnl_lock();
5063                 ret = dev_ifsioc(net, &ifr, cmd);
5064                 rtnl_unlock();
5065                 return ret;
5066
5067         case SIOCGIFMEM:
5068                 /* Get the per device memory space. We can add this but
5069                  * currently do not support it */
5070         case SIOCSIFMEM:
5071                 /* Set the per device memory buffer space.
5072                  * Not applicable in our case */
5073         case SIOCSIFLINK:
5074                 return -ENOTTY;
5075
5076         /*
5077          *      Unknown or private ioctl.
5078          */
5079         default:
5080                 if (cmd == SIOCWANDEV ||
5081                     (cmd >= SIOCDEVPRIVATE &&
5082                      cmd <= SIOCDEVPRIVATE + 15)) {
5083                         dev_load(net, ifr.ifr_name);
5084                         rtnl_lock();
5085                         ret = dev_ifsioc(net, &ifr, cmd);
5086                         rtnl_unlock();
5087                         if (!ret && copy_to_user(arg, &ifr,
5088                                                  sizeof(struct ifreq)))
5089                                 ret = -EFAULT;
5090                         return ret;
5091                 }
5092                 /* Take care of Wireless Extensions */
5093                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5094                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5095                 return -ENOTTY;
5096         }
5097 }
5098
5099
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Returns an interface index that is positive and not used by any
 *      device currently found in @net. Candidates come from a single
 *      static counter that restarts at 1 once it stops being positive.
 *      The caller must hold the rtnl semaphore or the dev_base_lock to
 *      be sure the value remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
5118
5119 /* Delayed registration/unregisteration */
5120 static LIST_HEAD(net_todo_list);
5121
5122 static void net_set_todo(struct net_device *dev)
5123 {
5124         list_add_tail(&dev->todo_list, &net_todo_list);
5125 }
5126
5127 static void rollback_registered_many(struct list_head *head)
5128 {
5129         struct net_device *dev, *tmp;
5130
5131         BUG_ON(dev_boot_phase);
5132         ASSERT_RTNL();
5133
5134         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5135                 /* Some devices call without registering
5136                  * for initialization unwind. Remove those
5137                  * devices and proceed with the remaining.
5138                  */
5139                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5140                         pr_debug("unregister_netdevice: device %s/%p never "
5141                                  "was registered\n", dev->name, dev);
5142
5143                         WARN_ON(1);
5144                         list_del(&dev->unreg_list);
5145                         continue;
5146                 }
5147                 dev->dismantle = true;
5148                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5149         }
5150
5151         /* If device is running, close it first. */
5152         dev_close_many(head);
5153
5154         list_for_each_entry(dev, head, unreg_list) {
5155                 /* And unlink it from device chain. */
5156                 unlist_netdevice(dev);
5157
5158                 dev->reg_state = NETREG_UNREGISTERING;
5159         }
5160
5161         synchronize_net();
5162
5163         list_for_each_entry(dev, head, unreg_list) {
5164                 /* Shutdown queueing discipline. */
5165                 dev_shutdown(dev);
5166
5167
5168                 /* Notify protocols, that we are about to destroy
5169                    this device. They should clean all the things.
5170                 */
5171                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5172
5173                 if (!dev->rtnl_link_ops ||
5174                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5175                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5176
5177                 /*
5178                  *      Flush the unicast and multicast chains
5179                  */
5180                 dev_uc_flush(dev);
5181                 dev_mc_flush(dev);
5182
5183                 if (dev->netdev_ops->ndo_uninit)
5184                         dev->netdev_ops->ndo_uninit(dev);
5185
5186                 /* Notifier chain MUST detach us from master device. */
5187                 WARN_ON(dev->master);
5188
5189                 /* Remove entries from kobject tree */
5190                 netdev_unregister_kobject(dev);
5191         }
5192
5193         /* Process any work delayed until the end of the batch */
5194         dev = list_first_entry(head, struct net_device, unreg_list);
5195         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5196
5197         rcu_barrier();
5198
5199         list_for_each_entry(dev, head, unreg_list)
5200                 dev_put(dev);
5201 }
5202
5203 static void rollback_registered(struct net_device *dev)
5204 {
5205         LIST_HEAD(single);
5206
5207         list_add(&dev->unreg_list, &single);
5208         rollback_registered_many(&single);
5209         list_del(&single);
5210 }
5211
5212 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5213 {
5214         /* Fix illegal checksum combinations */
5215         if ((features & NETIF_F_HW_CSUM) &&
5216             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5217                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5218                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5219         }
5220
5221         if ((features & NETIF_F_NO_CSUM) &&
5222             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5223                 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5224                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5225         }
5226
5227         /* Fix illegal SG+CSUM combinations. */
5228         if ((features & NETIF_F_SG) &&
5229             !(features & NETIF_F_ALL_CSUM)) {
5230                 netdev_dbg(dev,
5231                         "Dropping NETIF_F_SG since no checksum feature.\n");
5232                 features &= ~NETIF_F_SG;
5233         }
5234
5235         /* TSO requires that SG is present as well. */
5236         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5237                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5238                 features &= ~NETIF_F_ALL_TSO;
5239         }
5240
5241         /* TSO ECN requires that TSO is present as well. */
5242         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5243                 features &= ~NETIF_F_TSO_ECN;
5244
5245         /* Software GSO depends on SG. */
5246         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5247                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5248                 features &= ~NETIF_F_GSO;
5249         }
5250
5251         /* UFO needs SG and checksumming */
5252         if (features & NETIF_F_UFO) {
5253                 /* maybe split UFO into V4 and V6? */
5254                 if (!((features & NETIF_F_GEN_CSUM) ||
5255                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5256                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5257                         netdev_dbg(dev,
5258                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5259                         features &= ~NETIF_F_UFO;
5260                 }
5261
5262                 if (!(features & NETIF_F_SG)) {
5263                         netdev_dbg(dev,
5264                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5265                         features &= ~NETIF_F_UFO;
5266                 }
5267         }
5268
5269         return features;
5270 }
5271
5272 int __netdev_update_features(struct net_device *dev)
5273 {
5274         u32 features;
5275         int err = 0;
5276
5277         ASSERT_RTNL();
5278
5279         features = netdev_get_wanted_features(dev);
5280
5281         if (dev->netdev_ops->ndo_fix_features)
5282                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5283
5284         /* driver might be less strict about feature dependencies */
5285         features = netdev_fix_features(dev, features);
5286
5287         if (dev->features == features)
5288                 return 0;
5289
5290         netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5291                 dev->features, features);
5292
5293         if (dev->netdev_ops->ndo_set_features)
5294                 err = dev->netdev_ops->ndo_set_features(dev, features);
5295
5296         if (unlikely(err < 0)) {
5297                 netdev_err(dev,
5298                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5299                         err, features, dev->features);
5300                 return -1;
5301         }
5302
5303         if (!err)
5304                 dev->features = features;
5305
5306         return 1;
5307 }
5308
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and, whenever
 *      __netdev_update_features() returns a non-zero value, send a
 *      features-change notification. Should be called after driver or
 *      hardware dependent conditions that influence the features might
 *      have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        int changed = __netdev_update_features(dev);

        if (changed != 0)
                netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5323
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and always send a notification,
 *      whether or not the set changed. Use this instead of
 *      netdev_update_features() when dev->vlan_features might have
 *      changed too, so that the change can be propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: notify unconditionally. */
        (void)__netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5340
5341 /**
5342  *      netif_stacked_transfer_operstate -      transfer operstate
5343  *      @rootdev: the root or lower level device to transfer state from
5344  *      @dev: the device to transfer operstate to
5345  *
5346  *      Transfer operational state from root to device. This is normally
5347  *      called when a stacking relationship exists between the root
5348  *      device and the device(a leaf device).
5349  */
5350 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5351                                         struct net_device *dev)
5352 {
5353         if (rootdev->operstate == IF_OPER_DORMANT)
5354                 netif_dormant_on(dev);
5355         else
5356                 netif_dormant_off(dev);
5357
5358         if (netif_carrier_ok(rootdev)) {
5359                 if (!netif_carrier_ok(dev))
5360                         netif_carrier_on(dev);
5361         } else {
5362                 if (netif_carrier_ok(dev))
5363                         netif_carrier_off(dev);
5364         }
5365 }
5366 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5367
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 *
 * dev->num_rx_queues must be at least 1 (enforced with BUG_ON).
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int count = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(count < 1);

        queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
        if (queues == NULL) {
                pr_err("netdev: Unable to allocate %u rx queues.\n", count);
                return -ENOMEM;
        }
        dev->_rx = queues;

        for (idx = 0; idx < count; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
5388
5389 static void netdev_init_one_queue(struct net_device *dev,
5390                                   struct netdev_queue *queue, void *_unused)
5391 {
5392         /* Initialize queue lock */
5393         spin_lock_init(&queue->_xmit_lock);
5394         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5395         queue->xmit_lock_owner = -1;
5396         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5397         queue->dev = dev;
5398 }
5399
5400 static int netif_alloc_netdev_queues(struct net_device *dev)
5401 {
5402         unsigned int count = dev->num_tx_queues;
5403         struct netdev_queue *tx;
5404
5405         BUG_ON(count < 1);
5406
5407         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5408         if (!tx) {
5409                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5410                        count);
5411                 return -ENOMEM;
5412         }
5413         dev->_tx = tx;
5414
5415         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5416         spin_lock_init(&dev->tx_global_lock);
5417
5418         return 0;
5419 }
5420
5421 /**
5422  *      register_netdevice      - register a network device
5423  *      @dev: device to register
5424  *
5425  *      Take a completed network device structure and add it to the kernel
5426  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5427  *      chain. 0 is returned on success. A negative errno code is returned
5428  *      on a failure to set up the device, or if the name is a duplicate.
5429  *
5430  *      Callers must hold the rtnl semaphore. You may want
5431  *      register_netdev() instead of this.
5432  *
5433  *      BUGS:
5434  *      The locking appears insufficient to guarantee two parallel registers
5435  *      will not get the same name.
5436  */
5437
5438 int register_netdevice(struct net_device *dev)
5439 {
5440         int ret;
5441         struct net *net = dev_net(dev);
5442
5443         BUG_ON(dev_boot_phase);
5444         ASSERT_RTNL();
5445
5446         might_sleep();
5447
5448         /* When net_device's are persistent, this will be fatal. */
5449         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5450         BUG_ON(!net);
5451
5452         spin_lock_init(&dev->addr_list_lock);
5453         netdev_set_addr_lockdep_class(dev);
5454
5455         dev->iflink = -1;
5456
5457         ret = dev_get_valid_name(dev, dev->name);
5458         if (ret < 0)
5459                 goto out;
5460
5461         /* Init, if this function is available */
5462         if (dev->netdev_ops->ndo_init) {
5463                 ret = dev->netdev_ops->ndo_init(dev);
5464                 if (ret) {
5465                         if (ret > 0)
5466                                 ret = -EIO;
5467                         goto out;
5468                 }
5469         }
5470
5471         dev->ifindex = dev_new_index(net);
5472         if (dev->iflink == -1)
5473                 dev->iflink = dev->ifindex;
5474
5475         /* Transfer changeable features to wanted_features and enable
5476          * software offloads (GSO and GRO).
5477          */
5478         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5479         dev->features |= NETIF_F_SOFT_FEATURES;
5480         dev->wanted_features = dev->features & dev->hw_features;
5481
5482         /* Turn on no cache copy if HW is doing checksum */
5483         dev->hw_features |= NETIF_F_NOCACHE_COPY;
5484         if ((dev->features & NETIF_F_ALL_CSUM) &&
5485             !(dev->features & NETIF_F_NO_CSUM)) {
5486                 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5487                 dev->features |= NETIF_F_NOCACHE_COPY;
5488         }
5489
5490         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5491          */
5492         dev->vlan_features |= NETIF_F_HIGHDMA;
5493
5494         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5495         ret = notifier_to_errno(ret);
5496         if (ret)
5497                 goto err_uninit;
5498
5499         ret = netdev_register_kobject(dev);
5500         if (ret)
5501                 goto err_uninit;
5502         dev->reg_state = NETREG_REGISTERED;
5503
5504         __netdev_update_features(dev);
5505
5506         /*
5507          *      Default initial state at registry is that the
5508          *      device is present.
5509          */
5510
5511         set_bit(__LINK_STATE_PRESENT, &dev->state);
5512
5513         dev_init_scheduler(dev);
5514         dev_hold(dev);
5515         list_netdevice(dev);
5516
5517         /* Notify protocols, that a new device appeared. */
5518         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5519         ret = notifier_to_errno(ret);
5520         if (ret) {
5521                 rollback_registered(dev);
5522                 dev->reg_state = NETREG_UNREGISTERED;
5523         }
5524         /*
5525          *      Prevent userspace races by waiting until the network
5526          *      device is fully setup before sending notifications.
5527          */
5528         if (!dev->rtnl_link_ops ||
5529             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5530                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5531
5532 out:
5533         return ret;
5534
5535 err_uninit:
5536         if (dev->netdev_ops->ndo_uninit)
5537                 dev->netdev_ops->ndo_uninit(dev);
5538         goto out;
5539 }
5540 EXPORT_SYMBOL(register_netdevice);
5541
5542 /**
5543  *      init_dummy_netdev       - init a dummy network device for NAPI
5544  *      @dev: device to init
5545  *
5546  *      This takes a network device structure and initialize the minimum
5547  *      amount of fields so it can be used to schedule NAPI polls without
5548  *      registering a full blown interface. This is to be used by drivers
5549  *      that need to tie several hardware interfaces to a single NAPI
5550  *      poll scheduler due to HW limitations.
5551  */
5552 int init_dummy_netdev(struct net_device *dev)
5553 {
5554         /* Clear everything. Note we don't initialize spinlocks
5555          * are they aren't supposed to be taken by any of the
5556          * NAPI code and this dummy netdev is supposed to be
5557          * only ever used for NAPI polls
5558          */
5559         memset(dev, 0, sizeof(struct net_device));
5560
5561         /* make sure we BUG if trying to hit standard
5562          * register/unregister code path
5563          */
5564         dev->reg_state = NETREG_DUMMY;
5565
5566         /* NAPI wants this */
5567         INIT_LIST_HEAD(&dev->napi_list);
5568
5569         /* a dummy interface is started by default */
5570         set_bit(__LINK_STATE_PRESENT, &dev->state);
5571         set_bit(__LINK_STATE_START, &dev->state);
5572
5573         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5574          * because users of this 'device' dont need to change
5575          * its refcount.
5576          */
5577
5578         return 0;
5579 }
5580 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5581
5582
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is a wrapper around register_netdevice() that takes the rtnl
 *      semaphore for the duration of the call; the device name handling
 *      is done by register_netdevice() itself.
 */
int register_netdev(struct net_device *dev)
{
        int rc;

        rtnl_lock();
        rc = register_netdevice(dev);
        rtnl_unlock();

        return rc;
}
EXPORT_SYMBOL(register_netdev);
5606
5607 int netdev_refcnt_read(const struct net_device *dev)
5608 {
5609         int i, refcnt = 0;
5610
5611         for_each_possible_cpu(i)
5612                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5613         return refcnt;
5614 }
5615 EXPORT_SYMBOL(netdev_refcnt_read);
5616
5617 /*
5618  * netdev_wait_allrefs - wait until all references are gone.
5619  *
5620  * This is called when unregistering network devices.
5621  *
5622  * Any protocol or device that holds a reference should register
5623  * for netdevice notification, and cleanup and put back the
5624  * reference if they receive an UNREGISTER event.
5625  * We can get stuck here if buggy protocols don't correctly
5626  * call dev_put.
5627  */
5628 static void netdev_wait_allrefs(struct net_device *dev)
5629 {
5630         unsigned long rebroadcast_time, warning_time;
5631         int refcnt;
5632
5633         linkwatch_forget_dev(dev);
5634
5635         rebroadcast_time = warning_time = jiffies;
5636         refcnt = netdev_refcnt_read(dev);
5637
5638         while (refcnt != 0) {
5639                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5640                         rtnl_lock();
5641
5642                         /* Rebroadcast unregister notification */
5643                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5644                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5645                          * should have already handle it the first time */
5646
5647                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5648                                      &dev->state)) {
5649                                 /* We must not have linkwatch events
5650                                  * pending on unregister. If this
5651                                  * happens, we simply run the queue
5652                                  * unscheduled, resulting in a noop
5653                                  * for this device.
5654                                  */
5655                                 linkwatch_run_queue();
5656                         }
5657
5658                         __rtnl_unlock();
5659
5660                         rebroadcast_time = jiffies;
5661                 }
5662
5663                 msleep(250);
5664
5665                 refcnt = netdev_refcnt_read(dev);
5666
5667                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5668                         printk(KERN_EMERG "unregister_netdevice: "
5669                                "waiting for %s to become free. Usage "
5670                                "count = %d\n",
5671                                dev->name, refcnt);
5672                         warning_time = jiffies;
5673                 }
5674         }
5675 }
5676
5677 /* The sequence is:
5678  *
5679  *      rtnl_lock();
5680  *      ...
5681  *      register_netdevice(x1);
5682  *      register_netdevice(x2);
5683  *      ...
5684  *      unregister_netdevice(y1);
5685  *      unregister_netdevice(y2);
5686  *      ...
5687  *      rtnl_unlock();
5688  *      free_netdev(y1);
5689  *      free_netdev(y2);
5690  *
5691  * We are invoked by rtnl_unlock().
5692  * This allows us to deal with problems:
5693  * 1) We can delete sysfs objects which invoke hotplug
5694  *    without deadlocking with linkwatch via keventd.
5695  * 2) Since we run with the RTNL semaphore not held, we can sleep
5696  *    safely in order to wait for the netdev refcnt to drop to zero.
5697  *
5698  * We must not return until all unregister events added during
5699  * the interval the lock was held have been completed.
5700  */
5701 void netdev_run_todo(void)
5702 {
5703         struct list_head list;
5704
5705         /* Snapshot list, allow later requests */
5706         list_replace_init(&net_todo_list, &list);
5707
5708         __rtnl_unlock();
5709
5710         while (!list_empty(&list)) {
5711                 struct net_device *dev
5712                         = list_first_entry(&list, struct net_device, todo_list);
5713                 list_del(&dev->todo_list);
5714
5715                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5716                         printk(KERN_ERR "network todo '%s' but state %d\n",
5717                                dev->name, dev->reg_state);
5718                         dump_stack();
5719                         continue;
5720                 }
5721
5722                 dev->reg_state = NETREG_UNREGISTERED;
5723
5724                 on_each_cpu(flush_backlog, dev, 1);
5725
5726                 netdev_wait_allrefs(dev);
5727
5728                 /* paranoia */
5729                 BUG_ON(netdev_refcnt_read(dev));
5730                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5731                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5732                 WARN_ON(dev->dn_ptr);
5733
5734                 if (dev->destructor)
5735                         dev->destructor(dev);
5736
5737                 /* Free network device */
5738                 kobject_put(&dev->dev.kobj);
5739         }
5740 }
5741
5742 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5743  * fields in the same order, with only the type differing.
5744  */
5745 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5746                                     const struct net_device_stats *netdev_stats)
5747 {
5748 #if BITS_PER_LONG == 64
5749         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5750         memcpy(stats64, netdev_stats, sizeof(*stats64));
5751 #else
5752         size_t i, n = sizeof(*stats64) / sizeof(u64);
5753         const unsigned long *src = (const unsigned long *)netdev_stats;
5754         u64 *dst = (u64 *)stats64;
5755
5756         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5757                      sizeof(*stats64) / sizeof(u64));
5758         for (i = 0; i < n; i++)
5759                 dst[i] = src[i];
5760 #endif
5761 }
5762
5763 /**
5764  *      dev_get_stats   - get network device statistics
5765  *      @dev: device to get statistics from
5766  *      @storage: place to store stats
5767  *
5768  *      Get network statistics from device. Return @storage.
5769  *      The device driver may provide its own method by setting
5770  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5771  *      otherwise the internal statistics structure is used.
5772  */
5773 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5774                                         struct rtnl_link_stats64 *storage)
5775 {
5776         const struct net_device_ops *ops = dev->netdev_ops;
5777
5778         if (ops->ndo_get_stats64) {
5779                 memset(storage, 0, sizeof(*storage));
5780                 ops->ndo_get_stats64(dev, storage);
5781         } else if (ops->ndo_get_stats) {
5782                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5783         } else {
5784                 netdev_stats_to_stats64(storage, &dev->stats);
5785         }
5786         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5787         return storage;
5788 }
5789 EXPORT_SYMBOL(dev_get_stats);
5790
/*
 * Return the ingress queue of @dev, creating it first if necessary.
 *
 * Without CONFIG_NET_CLS_ACT this simply returns whatever
 * dev_ingress_queue() reports.  With it, a missing queue is allocated,
 * initialized, attached to noop_qdisc and published through
 * dev->ingress_queue.  Returns NULL if that allocation fails.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only after the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5808
5809 /**
5810  *      alloc_netdev_mqs - allocate network device
5811  *      @sizeof_priv:   size of private data to allocate space for
5812  *      @name:          device name format string
5813  *      @setup:         callback to initialize device
5814  *      @txqs:          the number of TX subqueues to allocate
5815  *      @rxqs:          the number of RX subqueues to allocate
5816  *
5817  *      Allocates a struct net_device with private data area for driver use
5818  *      and performs basic initialization.  Also allocates subquue structs
5819  *      for each queue on the device.
5820  */
5821 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5822                 void (*setup)(struct net_device *),
5823                 unsigned int txqs, unsigned int rxqs)
5824 {
5825         struct net_device *dev;
5826         size_t alloc_size;
5827         struct net_device *p;
5828
5829         BUG_ON(strlen(name) >= sizeof(dev->name));
5830
5831         if (txqs < 1) {
5832                 pr_err("alloc_netdev: Unable to allocate device "
5833                        "with zero queues.\n");
5834                 return NULL;
5835         }
5836
5837 #ifdef CONFIG_RPS
5838         if (rxqs < 1) {
5839                 pr_err("alloc_netdev: Unable to allocate device "
5840                        "with zero RX queues.\n");
5841                 return NULL;
5842         }
5843 #endif
5844
5845         alloc_size = sizeof(struct net_device);
5846         if (sizeof_priv) {
5847                 /* ensure 32-byte alignment of private area */
5848                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5849                 alloc_size += sizeof_priv;
5850         }
5851         /* ensure 32-byte alignment of whole construct */
5852         alloc_size += NETDEV_ALIGN - 1;
5853
5854         p = kzalloc(alloc_size, GFP_KERNEL);
5855         if (!p) {
5856                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5857                 return NULL;
5858         }
5859
5860         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5861         dev->padded = (char *)dev - (char *)p;
5862
5863         dev->pcpu_refcnt = alloc_percpu(int);
5864         if (!dev->pcpu_refcnt)
5865                 goto free_p;
5866
5867         if (dev_addr_init(dev))
5868                 goto free_pcpu;
5869
5870         dev_mc_init(dev);
5871         dev_uc_init(dev);
5872
5873         dev_net_set(dev, &init_net);
5874
5875         dev->gso_max_size = GSO_MAX_SIZE;
5876
5877         INIT_LIST_HEAD(&dev->napi_list);
5878         INIT_LIST_HEAD(&dev->unreg_list);
5879         INIT_LIST_HEAD(&dev->link_watch_list);
5880         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5881         setup(dev);
5882
5883         dev->num_tx_queues = txqs;
5884         dev->real_num_tx_queues = txqs;
5885         if (netif_alloc_netdev_queues(dev))
5886                 goto free_all;
5887
5888 #ifdef CONFIG_RPS
5889         dev->num_rx_queues = rxqs;
5890         dev->real_num_rx_queues = rxqs;
5891         if (netif_alloc_rx_queues(dev))
5892                 goto free_all;
5893 #endif
5894
5895         strcpy(dev->name, name);
5896         dev->group = INIT_NETDEV_GROUP;
5897         return dev;
5898
5899 free_all:
5900         free_netdev(dev);
5901         return NULL;
5902
5903 free_pcpu:
5904         free_percpu(dev->pcpu_refcnt);
5905         kfree(dev->_tx);
5906 #ifdef CONFIG_RPS
5907         kfree(dev->_rx);
5908 #endif
5909
5910 free_p:
5911         kfree(p);
5912         return NULL;
5913 }
5914 EXPORT_SYMBOL(alloc_netdev_mqs);
5915
5916 /**
5917  *      free_netdev - free network device
5918  *      @dev: device
5919  *
5920  *      This function does the last stage of destroying an allocated device
5921  *      interface. The reference to the device object is released.
5922  *      If this is the last reference then it will be freed.
5923  */
5924 void free_netdev(struct net_device *dev)
5925 {
5926         struct napi_struct *p, *n;
5927
5928         release_net(dev_net(dev));
5929
5930         kfree(dev->_tx);
5931 #ifdef CONFIG_RPS
5932         kfree(dev->_rx);
5933 #endif
5934
5935         kfree(rcu_dereference_raw(dev->ingress_queue));
5936
5937         /* Flush device addresses */
5938         dev_addr_flush(dev);
5939
5940         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5941                 netif_napi_del(p);
5942
5943         free_percpu(dev->pcpu_refcnt);
5944         dev->pcpu_refcnt = NULL;
5945
5946         /*  Compatibility with error handling in drivers */
5947         if (dev->reg_state == NETREG_UNINITIALIZED) {
5948                 kfree((char *)dev - dev->padded);
5949                 return;
5950         }
5951
5952         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5953         dev->reg_state = NETREG_RELEASED;
5954
5955         /* will free via device release */
5956         put_device(&dev->dev);
5957 }
5958 EXPORT_SYMBOL(free_netdev);
5959
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      May sleep. The expedited RCU wait is used when the rtnl lock
 *      is held, the normal one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked())
                synchronize_rcu();
        else
                synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5975
5976 /**
5977  *      unregister_netdevice_queue - remove device from the kernel
5978  *      @dev: device
5979  *      @head: list
5980  *
5981  *      This function shuts down a device interface and removes it
5982  *      from the kernel tables.
5983  *      If head not NULL, device is queued to be unregistered later.
5984  *
5985  *      Callers must hold the rtnl semaphore.  You may want
5986  *      unregister_netdev() instead of this.
5987  */
5988
5989 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5990 {
5991         ASSERT_RTNL();
5992
5993         if (head) {
5994                 list_move_tail(&dev->unreg_list, head);
5995         } else {
5996                 rollback_registered(dev);
5997                 /* Finish processing unregister after unlock */
5998                 net_set_todo(dev);
5999         }
6000 }
6001 EXPORT_SYMBOL(unregister_netdevice_queue);
6002
6003 /**
6004  *      unregister_netdevice_many - unregister many devices
6005  *      @head: list of devices
6006  */
6007 void unregister_netdevice_many(struct list_head *head)
6008 {
6009         struct net_device *dev;
6010
6011         if (!list_empty(head)) {
6012                 rollback_registered_many(head);
6013                 list_for_each_entry(dev, head, unreg_list)
6014                         net_set_todo(dev);
6015         }
6016 }
6017 EXPORT_SYMBOL(unregister_netdevice_many);
6018
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.
 *
 *      This is just a wrapper for unregister_netdevice() that takes
 *      the rtnl semaphore around the call.  In general you want to use
 *      this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6037
6038 /**
6039  *      dev_change_net_namespace - move device to different nethost namespace
6040  *      @dev: device
6041  *      @net: network namespace
6042  *      @pat: If not NULL name pattern to try if the current device name
6043  *            is already taken in the destination network namespace.
6044  *
6045  *      This function shuts down a device interface and moves it
6046  *      to a new network namespace. On success 0 is returned, on
6047  *      a failure a netagive errno code is returned.
6048  *
6049  *      Callers must hold the rtnl semaphore.
6050  */
6051
6052 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6053 {
6054         int err;
6055
6056         ASSERT_RTNL();
6057
6058         /* Don't allow namespace local devices to be moved. */
6059         err = -EINVAL;
6060         if (dev->features & NETIF_F_NETNS_LOCAL)
6061                 goto out;
6062
6063         /* Ensure the device has been registrered */
6064         err = -EINVAL;
6065         if (dev->reg_state != NETREG_REGISTERED)
6066                 goto out;
6067
6068         /* Get out if there is nothing todo */
6069         err = 0;
6070         if (net_eq(dev_net(dev), net))
6071                 goto out;
6072
6073         /* Pick the destination device name, and ensure
6074          * we can use it in the destination network namespace.
6075          */
6076         err = -EEXIST;
6077         if (__dev_get_by_name(net, dev->name)) {
6078                 /* We get here if we can't use the current device name */
6079                 if (!pat)
6080                         goto out;
6081                 if (dev_get_valid_name(dev, pat) < 0)
6082                         goto out;
6083         }
6084
6085         /*
6086          * And now a mini version of register_netdevice unregister_netdevice.
6087          */
6088
6089         /* If device is running close it first. */
6090         dev_close(dev);
6091
6092         /* And unlink it from device chain */
6093         err = -ENODEV;
6094         unlist_netdevice(dev);
6095
6096         synchronize_net();
6097
6098         /* Shutdown queueing discipline. */
6099         dev_shutdown(dev);
6100
6101         /* Notify protocols, that we are about to destroy
6102            this device. They should clean all the things.
6103
6104            Note that dev->reg_state stays at NETREG_REGISTERED.
6105            This is wanted because this way 8021q and macvlan know
6106            the device is just moving and can keep their slaves up.
6107         */
6108         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6109         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6110
6111         /*
6112          *      Flush the unicast and multicast chains
6113          */
6114         dev_uc_flush(dev);
6115         dev_mc_flush(dev);
6116
6117         /* Actually switch the network namespace */
6118         dev_net_set(dev, net);
6119
6120         /* If there is an ifindex conflict assign a new one */
6121         if (__dev_get_by_index(net, dev->ifindex)) {
6122                 int iflink = (dev->iflink == dev->ifindex);
6123                 dev->ifindex = dev_new_index(net);
6124                 if (iflink)
6125                         dev->iflink = dev->ifindex;
6126         }
6127
6128         /* Fixup kobjects */
6129         err = device_rename(&dev->dev, dev->name);
6130         WARN_ON(err);
6131
6132         /* Add the device back in the hashes */
6133         list_netdevice(dev);
6134
6135         /* Notify protocols, that a new device appeared. */
6136         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6137
6138         /*
6139          *      Prevent userspace races by waiting until the network
6140          *      device is fully setup before sending notifications.
6141          */
6142         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6143
6144         synchronize_net();
6145         err = 0;
6146 out:
6147         return err;
6148 }
6149 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6150
6151 static int dev_cpu_callback(struct notifier_block *nfb,
6152                             unsigned long action,
6153                             void *ocpu)
6154 {
6155         struct sk_buff **list_skb;
6156         struct sk_buff *skb;
6157         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6158         struct softnet_data *sd, *oldsd;
6159
6160         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6161                 return NOTIFY_OK;
6162
6163         local_irq_disable();
6164         cpu = smp_processor_id();
6165         sd = &per_cpu(softnet_data, cpu);
6166         oldsd = &per_cpu(softnet_data, oldcpu);
6167
6168         /* Find end of our completion_queue. */
6169         list_skb = &sd->completion_queue;
6170         while (*list_skb)
6171                 list_skb = &(*list_skb)->next;
6172         /* Append completion queue from offline CPU. */
6173         *list_skb = oldsd->completion_queue;
6174         oldsd->completion_queue = NULL;
6175
6176         /* Append output queue from offline CPU. */
6177         if (oldsd->output_queue) {
6178                 *sd->output_queue_tailp = oldsd->output_queue;
6179                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6180                 oldsd->output_queue = NULL;
6181                 oldsd->output_queue_tailp = &oldsd->output_queue;
6182         }
6183         /* Append NAPI poll list from offline CPU. */
6184         if (!list_empty(&oldsd->poll_list)) {
6185                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6186                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6187         }
6188
6189         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6190         local_irq_enable();
6191
6192         /* Process offline CPU's input_pkt_queue */
6193         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6194                 netif_rx(skb);
6195                 input_queue_head_incr(oldsd);
6196         }
6197         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6198                 netif_rx(skb);
6199                 input_queue_head_incr(oldsd);
6200         }
6201
6202         return NOTIFY_OK;
6203 }
6204
6205
6206 /**
6207  *      netdev_increment_features - increment feature set by one
6208  *      @all: current feature set
6209  *      @one: new feature set
6210  *      @mask: mask feature set
6211  *
6212  *      Computes a new feature set after adding a device with feature set
6213  *      @one to the master device with current feature set @all.  Will not
6214  *      enable anything that is off in @mask. Returns the new feature set.
6215  */
6216 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6217 {
6218         if (mask & NETIF_F_GEN_CSUM)
6219                 mask |= NETIF_F_ALL_CSUM;
6220         mask |= NETIF_F_VLAN_CHALLENGED;
6221
6222         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6223         all &= one | ~NETIF_F_ALL_FOR_ALL;
6224
6225         /* If device needs checksumming, downgrade to it. */
6226         if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6227                 all &= ~NETIF_F_NO_CSUM;
6228
6229         /* If one device supports hw checksumming, set for all. */
6230         if (all & NETIF_F_GEN_CSUM)
6231                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6232
6233         return all;
6234 }
6235 EXPORT_SYMBOL(netdev_increment_features);
6236
6237 static struct hlist_head *netdev_create_hash(void)
6238 {
6239         int i;
6240         struct hlist_head *hash;
6241
6242         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6243         if (hash != NULL)
6244                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6245                         INIT_HLIST_HEAD(&hash[i]);
6246
6247         return hash;
6248 }
6249
6250 /* Initialize per network namespace state */
6251 static int __net_init netdev_init(struct net *net)
6252 {
6253         INIT_LIST_HEAD(&net->dev_base_head);
6254
6255         net->dev_name_head = netdev_create_hash();
6256         if (net->dev_name_head == NULL)
6257                 goto err_name;
6258
6259         net->dev_index_head = netdev_create_hash();
6260         if (net->dev_index_head == NULL)
6261                 goto err_idx;
6262
6263         return 0;
6264
6265 err_idx:
6266         kfree(net->dev_name_head);
6267 err_name:
6268         return -ENOMEM;
6269 }
6270
6271 /**
6272  *      netdev_drivername - network driver for the device
6273  *      @dev: network device
6274  *
6275  *      Determine network driver for device.
6276  */
6277 const char *netdev_drivername(const struct net_device *dev)
6278 {
6279         const struct device_driver *driver;
6280         const struct device *parent;
6281         const char *empty = "";
6282
6283         parent = dev->dev.parent;
6284         if (!parent)
6285                 return empty;
6286
6287         driver = parent->driver;
6288         if (driver && driver->name)
6289                 return driver->name;
6290         return empty;
6291 }
6292
6293 static int __netdev_printk(const char *level, const struct net_device *dev,
6294                            struct va_format *vaf)
6295 {
6296         int r;
6297
6298         if (dev && dev->dev.parent)
6299                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6300                                netdev_name(dev), vaf);
6301         else if (dev)
6302                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6303         else
6304                 r = printk("%s(NULL net_device): %pV", level, vaf);
6305
6306         return r;
6307 }
6308
6309 int netdev_printk(const char *level, const struct net_device *dev,
6310                   const char *format, ...)
6311 {
6312         struct va_format vaf;
6313         va_list args;
6314         int r;
6315
6316         va_start(args, format);
6317
6318         vaf.fmt = format;
6319         vaf.va = &args;
6320
6321         r = __netdev_printk(level, dev, &vaf);
6322         va_end(args);
6323
6324         return r;
6325 }
6326 EXPORT_SYMBOL(netdev_printk);
6327
6328 #define define_netdev_printk_level(func, level)                 \
6329 int func(const struct net_device *dev, const char *fmt, ...)    \
6330 {                                                               \
6331         int r;                                                  \
6332         struct va_format vaf;                                   \
6333         va_list args;                                           \
6334                                                                 \
6335         va_start(args, fmt);                                    \
6336                                                                 \
6337         vaf.fmt = fmt;                                          \
6338         vaf.va = &args;                                         \
6339                                                                 \
6340         r = __netdev_printk(level, dev, &vaf);                  \
6341         va_end(args);                                           \
6342                                                                 \
6343         return r;                                               \
6344 }                                                               \
6345 EXPORT_SYMBOL(func);
6346
6347 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6348 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6349 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6350 define_netdev_printk_level(netdev_err, KERN_ERR);
6351 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6352 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6353 define_netdev_printk_level(netdev_info, KERN_INFO);
6354
6355 static void __net_exit netdev_exit(struct net *net)
6356 {
6357         kfree(net->dev_name_head);
6358         kfree(net->dev_index_head);
6359 }
6360
6361 static struct pernet_operations __net_initdata netdev_net_ops = {
6362         .init = netdev_init,
6363         .exit = netdev_exit,
6364 };
6365
6366 static void __net_exit default_device_exit(struct net *net)
6367 {
6368         struct net_device *dev, *aux;
6369         /*
6370          * Push all migratable network devices back to the
6371          * initial network namespace
6372          */
6373         rtnl_lock();
6374         for_each_netdev_safe(net, dev, aux) {
6375                 int err;
6376                 char fb_name[IFNAMSIZ];
6377
6378                 /* Ignore unmoveable devices (i.e. loopback) */
6379                 if (dev->features & NETIF_F_NETNS_LOCAL)
6380                         continue;
6381
6382                 /* Leave virtual devices for the generic cleanup */
6383                 if (dev->rtnl_link_ops)
6384                         continue;
6385
6386                 /* Push remaining network devices to init_net */
6387                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6388                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6389                 if (err) {
6390                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6391                                 __func__, dev->name, err);
6392                         BUG();
6393                 }
6394         }
6395         rtnl_unlock();
6396 }
6397
6398 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6399 {
6400         /* At exit all network devices most be removed from a network
6401          * namespace.  Do this in the reverse order of registration.
6402          * Do this across as many network namespaces as possible to
6403          * improve batching efficiency.
6404          */
6405         struct net_device *dev;
6406         struct net *net;
6407         LIST_HEAD(dev_kill_list);
6408
6409         rtnl_lock();
6410         list_for_each_entry(net, net_list, exit_list) {
6411                 for_each_netdev_reverse(net, dev) {
6412                         if (dev->rtnl_link_ops)
6413                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6414                         else
6415                                 unregister_netdevice_queue(dev, &dev_kill_list);
6416                 }
6417         }
6418         unregister_netdevice_many(&dev_kill_list);
6419         list_del(&dev_kill_list);
6420         rtnl_unlock();
6421 }
6422
6423 static struct pernet_operations __net_initdata default_device_ops = {
6424         .exit = default_device_exit,
6425         .exit_batch = default_device_exit_batch,
6426 };
6427
6428 /*
6429  *      Initialize the DEV module. At boot time this walks the device list and
6430  *      unhooks any devices that fail to initialise (normally hardware not
6431  *      present) and leaves us with a valid list of present and active devices.
6432  *
6433  */
6434
6435 /*
6436  *       This is called single threaded during boot, so no need
6437  *       to take the rtnl semaphore.
6438  */
6439 static int __init net_dev_init(void)
6440 {
6441         int i, rc = -ENOMEM;
6442
6443         BUG_ON(!dev_boot_phase);
6444
6445         if (dev_proc_init())
6446                 goto out;
6447
6448         if (netdev_kobject_init())
6449                 goto out;
6450
6451         INIT_LIST_HEAD(&ptype_all);
6452         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6453                 INIT_LIST_HEAD(&ptype_base[i]);
6454
6455         if (register_pernet_subsys(&netdev_net_ops))
6456                 goto out;
6457
6458         /*
6459          *      Initialise the packet receive queues.
6460          */
6461
6462         for_each_possible_cpu(i) {
6463                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6464
6465                 memset(sd, 0, sizeof(*sd));
6466                 skb_queue_head_init(&sd->input_pkt_queue);
6467                 skb_queue_head_init(&sd->process_queue);
6468                 sd->completion_queue = NULL;
6469                 INIT_LIST_HEAD(&sd->poll_list);
6470                 sd->output_queue = NULL;
6471                 sd->output_queue_tailp = &sd->output_queue;
6472 #ifdef CONFIG_RPS
6473                 sd->csd.func = rps_trigger_softirq;
6474                 sd->csd.info = sd;
6475                 sd->csd.flags = 0;
6476                 sd->cpu = i;
6477 #endif
6478
6479                 sd->backlog.poll = process_backlog;
6480                 sd->backlog.weight = weight_p;
6481                 sd->backlog.gro_list = NULL;
6482                 sd->backlog.gro_count = 0;
6483         }
6484
6485         dev_boot_phase = 0;
6486
6487         /* The loopback device is special if any other network devices
6488          * is present in a network namespace the loopback device must
6489          * be present. Since we now dynamically allocate and free the
6490          * loopback device ensure this invariant is maintained by
6491          * keeping the loopback device as the first device on the
6492          * list of network devices.  Ensuring the loopback devices
6493          * is the first device that appears and the last network device
6494          * that disappears.
6495          */
6496         if (register_pernet_device(&loopback_net_ops))
6497                 goto out;
6498
6499         if (register_pernet_device(&default_device_ops))
6500                 goto out;
6501
6502         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6503         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6504
6505         hotcpu_notifier(dev_cpu_callback, 0);
6506         dst_init();
6507         dev_mcast_init();
6508         rc = 0;
6509 out:
6510         return rc;
6511 }
6512
6513 subsys_initcall(net_dev_init);
6514
6515 static int __init initialize_hashrnd(void)
6516 {
6517         get_random_bytes(&hashrnd, sizeof(hashrnd));
6518         return 0;
6519 }
6520
6521 late_initcall_sync(initialize_hashrnd);
6522