net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <linux/if_bridge.h>
 105 #include <linux/if_macvlan.h>
 106 #include <net/dst.h>
 107 #include <net/pkt_sched.h>
 108 #include <net/checksum.h>
 109 #include <net/xfrm.h>
 110 #include <linux/highmem.h>
 111 #include <linux/init.h>
 112 #include <linux/kmod.h>
 113 #include <linux/module.h>
 114 #include <linux/netpoll.h>
 115 #include <linux/rcupdate.h>
 116 #include <linux/delay.h>
 117 #include <net/wext.h>
 118 #include <net/iw_handler.h>
 119 #include <asm/current.h>
 120 #include <linux/audit.h>
 121 #include <linux/dmaengine.h>
 122 #include <linux/err.h>
 123 #include <linux/ctype.h>
 124 #include <linux/if_arp.h>
 125 #include <linux/if_vlan.h>
 126 #include <linux/ip.h>
 127 #include <net/ip.h>
 128 #include <linux/ipv6.h>
 129 #include <linux/in.h>
 130 #include <linux/jhash.h>
 131 #include <linux/random.h>
 132 #include <trace/events/napi.h>
 133
 134 #include "net-sysfs.h"
 135
/*
 * Instead of increasing this, you should create a hash table.
 * NOTE(review): presumably the cap on packets held for GRO at one time;
 * the code using it is outside this view -- confirm.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142 /*
 143  *      The list of packet types we will receive (as opposed to discard)
 144  *      and the routines to invoke.
 145  *
 146  *      Why 16. Because with 16 the only overlap we get on a hash of the
 147  *      low nibble of the protocol value is RARP/SNAP/X.25.
 148  *
 149  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 150  *             sure which should go first, but I bet it won't make much
 151  *             difference if we are running VLANs.  The good news is that
 152  *             this protocol won't be in the list unless compiled in, so
 153  *             the average user (w/out VLANs) will not be adversely affected.
 154  *             --BLG
 155  *
 156  *              0800    IP
 157  *              8100    802.1Q VLAN
 158  *              0001    802.3
 159  *              0002    AX.25
 160  *              0004    802.2
 161  *              8035    RARP
 162  *              0005    SNAP
 163  *              0805    X.25
 164  *              0806    ARP
 165  *              8137    IPX
 166  *              0009    Localtalk
 167  *              86DD    IPv6
 168  */
 169
/* Number of buckets in ptype_base; the mask below relies on a power of two. */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/*
 * Taken with bottom halves disabled by dev_add_pack() and
 * __dev_remove_pack() around every update of the two lists below.
 */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one protocol each, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered with type htons(ETH_P_ALL). */
static struct list_head ptype_all __read_mostly;        /* Taps */
 176
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208 }
 209
 210 /* Device list insertion */
 211 static int list_netdevice(struct net_device *dev)
 212 {
 213         struct net *net = dev_net(dev);
 214
 215         ASSERT_RTNL();
 216
 217         write_lock_bh(&dev_base_lock);
 218         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 219         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 220         hlist_add_head_rcu(&dev->index_hlist,
 221                            dev_index_hash(net, dev->ifindex));
 222         write_unlock_bh(&dev_base_lock);
 223         return 0;
 224 }
 225
 226 /* Device list removal
 227  * caller must respect a RCU grace period before freeing/reusing dev
 228  */
 229 static void unlist_netdevice(struct net_device *dev)
 230 {
 231         ASSERT_RTNL();
 232
 233         /* Unlink dev from the device chain */
 234         write_lock_bh(&dev_base_lock);
 235         list_del_rcu(&dev->dev_list);
 236         hlist_del_rcu(&dev->name_hlist);
 237         hlist_del_rcu(&dev->index_hlist);
 238         write_unlock_bh(&dev_base_lock);
 239 }
 240
/*
 *      Our notifier list.
 *
 *      A raw notifier head, private to this file (static).
 *      NOTE(review): presumably the chain that netdevice event
 *      notifiers register on -- its users are outside this view.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 246
/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One struct softnet_data instance exists per CPU; the per-CPU symbol
 *      is exported.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 254
 255 #ifdef CONFIG_LOCKDEP
 256 /*
 257  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 258  * according to dev->type
 259  */
 260 static const unsigned short netdev_lock_type[] =
 261         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 262          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 263          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 264          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 265          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 266          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 267          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 268          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 269          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 270          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 271          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 272          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 273          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 274          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 275          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 276          ARPHRD_VOID, ARPHRD_NONE};
 277
 278 static const char *const netdev_lock_name[] =
 279         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 280          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 281          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 282          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 283          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 284          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 285          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 286          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 287          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 288          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 289          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 290          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 291          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 292          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 293          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 294          "_xmit_VOID", "_xmit_NONE"};
 295
 296 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 297 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300 {
 301         int i;
 302
 303         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304                 if (netdev_lock_type[i] == dev_type)
 305                         return i;
 306         /* the last key is used by default */
 307         return ARRAY_SIZE(netdev_lock_type) - 1;
 308 }
 309
 310 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 311                                                  unsigned short dev_type)
 312 {
 313         int i;
 314
 315         i = netdev_lock_pos(dev_type);
 316         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317                                    netdev_lock_name[i]);
 318 }
 319
 320 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 321 {
 322         int i;
 323
 324         i = netdev_lock_pos(dev->type);
 325         lockdep_set_class_and_name(&dev->addr_list_lock,
 326                                    &netdev_addr_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329 #else
 330 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 331                                                  unsigned short dev_type)
 332 {
 333 }
 334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335 {
 336 }
 337 #endif
 338
 339 /*******************************************************************************
 340
 341                 Protocol management and registration routines
 342
 343 *******************************************************************************/
 344
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in their hash buckets, and the check of protocol
 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a protocol handler that mangles packets were
 *      first on the list, it could not tell that the packet is
 *      cloned and must be copied-on-write, so it would change the packet
 *      in place and subsequent readers would get a broken packet.
 *                                                      --ANK (980803)
 */
 360
 361 /**
 362  *      dev_add_pack - add packet handler
 363  *      @pt: packet type declaration
 364  *
 365  *      Add a protocol handler to the networking stack. The passed &packet_type
 366  *      is linked into kernel lists and may not be freed until it has been
 367  *      removed from the kernel lists.
 368  *
 369  *      This call does not sleep therefore it can not
 370  *      guarantee all CPU's that are in middle of receiving packets
 371  *      will see the new packet type (until the next received packet).
 372  */
 373
 374 void dev_add_pack(struct packet_type *pt)
 375 {
 376         int hash;
 377
 378         spin_lock_bh(&ptype_lock);
 379         if (pt->type == htons(ETH_P_ALL))
 380                 list_add_rcu(&pt->list, &ptype_all);
 381         else {
 382                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 383                 list_add_rcu(&pt->list, &ptype_base[hash]);
 384         }
 385         spin_unlock_bh(&ptype_lock);
 386 }
 387 EXPORT_SYMBOL(dev_add_pack);
 388
 389 /**
 390  *      __dev_remove_pack        - remove packet handler
 391  *      @pt: packet type declaration
 392  *
 393  *      Remove a protocol handler that was previously added to the kernel
 394  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 395  *      from the kernel lists and can be freed or reused once this function
 396  *      returns.
 397  *
 398  *      The packet type might still be in use by receivers
 399  *      and must not be freed until after all the CPU's have gone
 400  *      through a quiescent state.
 401  */
 402 void __dev_remove_pack(struct packet_type *pt)
 403 {
 404         struct list_head *head;
 405         struct packet_type *pt1;
 406
 407         spin_lock_bh(&ptype_lock);
 408
 409         if (pt->type == htons(ETH_P_ALL))
 410                 head = &ptype_all;
 411         else
 412                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 413
 414         list_for_each_entry(pt1, head, list) {
 415                 if (pt == pt1) {
 416                         list_del_rcu(&pt->list);
 417                         goto out;
 418                 }
 419         }
 420
 421         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 422 out:
 423         spin_unlock_bh(&ptype_lock);
 424 }
 425 EXPORT_SYMBOL(__dev_remove_pack);
 426
 427 /**
 428  *      dev_remove_pack  - remove packet handler
 429  *      @pt: packet type declaration
 430  *
 431  *      Remove a protocol handler that was previously added to the kernel
 432  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 433  *      from the kernel lists and can be freed or reused once this function
 434  *      returns.
 435  *
 436  *      This call sleeps to guarantee that no CPU is looking at the packet
 437  *      type after return.
 438  */
 439 void dev_remove_pack(struct packet_type *pt)
 440 {
 441         __dev_remove_pack(pt);
 442
 443         synchronize_net();
 444 }
 445 EXPORT_SYMBOL(dev_remove_pack);
 446
 447 /******************************************************************************
 448
 449                       Device Boot-time Settings Routines
 450
 451 *******************************************************************************/
 452
/*
 * Boot time configuration table.
 * A slot is free while the first character of its name is '\0' or ' ';
 * netdev_boot_setup_add() fills the first free slot it finds.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 455
 456 /**
 457  *      netdev_boot_setup_add   - add new setup entry
 458  *      @name: name of the device
 459  *      @map: configured settings for the device
 460  *
 461  *      Adds new setup entry to the dev_boot_setup list.  The function
 462  *      returns 0 on error and 1 on success.  This is a generic routine to
 463  *      all netdevices.
 464  */
 465 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 466 {
 467         struct netdev_boot_setup *s;
 468         int i;
 469
 470         s = dev_boot_setup;
 471         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 472                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 473                         memset(s[i].name, 0, sizeof(s[i].name));
 474                         strlcpy(s[i].name, name, IFNAMSIZ);
 475                         memcpy(&s[i].map, map, sizeof(s[i].map));
 476                         break;
 477                 }
 478         }
 479
 480         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 481 }
 482
 483 /**
 484  *      netdev_boot_setup_check - check boot time settings
 485  *      @dev: the netdevice
 486  *
 487  *      Check boot time settings for the device.
 488  *      The found settings are set for the device to be used
 489  *      later in the device probing.
 490  *      Returns 0 if no settings found, 1 if they are.
 491  */
 492 int netdev_boot_setup_check(struct net_device *dev)
 493 {
 494         struct netdev_boot_setup *s = dev_boot_setup;
 495         int i;
 496
 497         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 499                     !strcmp(dev->name, s[i].name)) {
 500                         dev->irq        = s[i].map.irq;
 501                         dev->base_addr  = s[i].map.base_addr;
 502                         dev->mem_start  = s[i].map.mem_start;
 503                         dev->mem_end    = s[i].map.mem_end;
 504                         return 1;
 505                 }
 506         }
 507         return 0;
 508 }
 509 EXPORT_SYMBOL(netdev_boot_setup_check);
 510
 511
 512 /**
 513  *      netdev_boot_base        - get address from boot time settings
 514  *      @prefix: prefix for network device
 515  *      @unit: id for network device
 516  *
 517  *      Check boot time settings for the base address of device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found.
 521  */
 522 unsigned long netdev_boot_base(const char *prefix, int unit)
 523 {
 524         const struct netdev_boot_setup *s = dev_boot_setup;
 525         char name[IFNAMSIZ];
 526         int i;
 527
 528         sprintf(name, "%s%d", prefix, unit);
 529
 530         /*
 531          * If device already registered then return base of 1
 532          * to indicate not to probe for this interface
 533          */
 534         if (__dev_get_by_name(&init_net, name))
 535                 return 1;
 536
 537         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 538                 if (!strcmp(name, s[i].name))
 539                         return s[i].map.base_addr;
 540         return 0;
 541 }
 542
 543 /*
 544  * Saves at boot time configured settings for any netdevice.
 545  */
 546 int __init netdev_boot_setup(char *str)
 547 {
 548         int ints[5];
 549         struct ifmap map;
 550
 551         str = get_options(str, ARRAY_SIZE(ints), ints);
 552         if (!str || !*str)
 553                 return 0;
 554
 555         /* Save settings */
 556         memset(&map, 0, sizeof(map));
 557         if (ints[0] > 0)
 558                 map.irq = ints[1];
 559         if (ints[0] > 1)
 560                 map.base_addr = ints[2];
 561         if (ints[0] > 2)
 562                 map.mem_start = ints[3];
 563         if (ints[0] > 3)
 564                 map.mem_end = ints[4];
 565
 566         /* Add new entry to the list */
 567         return netdev_boot_setup_add(str, &map);
 568 }
 569
 570 __setup("netdev=", netdev_boot_setup);
 571
 572 /*******************************************************************************
 573
 574                             Device Interface Subroutines
 575
 576 *******************************************************************************/
 577
 578 /**
 579  *      __dev_get_by_name       - find a device by its name
 580  *      @net: the applicable net namespace
 581  *      @name: name to find
 582  *
 583  *      Find an interface by name. Must be called under RTNL semaphore
 584  *      or @dev_base_lock. If the name is found a pointer to the device
 585  *      is returned. If the name is not found then %NULL is returned. The
 586  *      reference counters are not incremented so the caller must be
 587  *      careful with locks.
 588  */
 589
 590 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 591 {
 592         struct hlist_node *p;
 593         struct net_device *dev;
 594         struct hlist_head *head = dev_name_hash(net, name);
 595
 596         hlist_for_each_entry(dev, p, head, name_hlist)
 597                 if (!strncmp(dev->name, name, IFNAMSIZ))
 598                         return dev;
 599
 600         return NULL;
 601 }
 602 EXPORT_SYMBOL(__dev_get_by_name);
 603
 604 /**
 605  *      dev_get_by_name_rcu     - find a device by its name
 606  *      @net: the applicable net namespace
 607  *      @name: name to find
 608  *
 609  *      Find an interface by name.
 610  *      If the name is found a pointer to the device is returned.
 611  *      If the name is not found then %NULL is returned.
 612  *      The reference counters are not incremented so the caller must be
 613  *      careful with locks. The caller must hold RCU lock.
 614  */
 615
 616 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 617 {
 618         struct hlist_node *p;
 619         struct net_device *dev;
 620         struct hlist_head *head = dev_name_hash(net, name);
 621
 622         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 623                 if (!strncmp(dev->name, name, IFNAMSIZ))
 624                         return dev;
 625
 626         return NULL;
 627 }
 628 EXPORT_SYMBOL(dev_get_by_name_rcu);
 629
 630 /**
 631  *      dev_get_by_name         - find a device by its name
 632  *      @net: the applicable net namespace
 633  *      @name: name to find
 634  *
 635  *      Find an interface by name. This can be called from any
 636  *      context and does its own locking. The returned handle has
 637  *      the usage count incremented and the caller must use dev_put() to
 638  *      release it when it is no longer needed. %NULL is returned if no
 639  *      matching device is found.
 640  */
 641
 642 struct net_device *dev_get_by_name(struct net *net, const char *name)
 643 {
 644         struct net_device *dev;
 645
 646         rcu_read_lock();
 647         dev = dev_get_by_name_rcu(net, name);
 648         if (dev)
 649                 dev_hold(dev);
 650         rcu_read_unlock();
 651         return dev;
 652 }
 653 EXPORT_SYMBOL(dev_get_by_name);
 654
 655 /**
 656  *      __dev_get_by_index - find a device by its ifindex
 657  *      @net: the applicable net namespace
 658  *      @ifindex: index of device
 659  *
 660  *      Search for an interface by index. Returns %NULL if the device
 661  *      is not found or a pointer to the device. The device has not
 662  *      had its reference counter increased so the caller must be careful
 663  *      about locking. The caller must hold either the RTNL semaphore
 664  *      or @dev_base_lock.
 665  */
 666
 667 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 668 {
 669         struct hlist_node *p;
 670         struct net_device *dev;
 671         struct hlist_head *head = dev_index_hash(net, ifindex);
 672
 673         hlist_for_each_entry(dev, p, head, index_hlist)
 674                 if (dev->ifindex == ifindex)
 675                         return dev;
 676
 677         return NULL;
 678 }
 679 EXPORT_SYMBOL(__dev_get_by_index);
 680
 681 /**
 682  *      dev_get_by_index_rcu - find a device by its ifindex
 683  *      @net: the applicable net namespace
 684  *      @ifindex: index of device
 685  *
 686  *      Search for an interface by index. Returns %NULL if the device
 687  *      is not found or a pointer to the device. The device has not
 688  *      had its reference counter increased so the caller must be careful
 689  *      about locking. The caller must hold RCU lock.
 690  */
 691
 692 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 693 {
 694         struct hlist_node *p;
 695         struct net_device *dev;
 696         struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 699                 if (dev->ifindex == ifindex)
 700                         return dev;
 701
 702         return NULL;
 703 }
 704 EXPORT_SYMBOL(dev_get_by_index_rcu);
 705
 706
 707 /**
 708  *      dev_get_by_index - find a device by its ifindex
 709  *      @net: the applicable net namespace
 710  *      @ifindex: index of device
 711  *
 712  *      Search for an interface by index. Returns NULL if the device
 713  *      is not found or a pointer to the device. The device returned has
 714  *      had a reference added and the pointer is safe until the user calls
 715  *      dev_put to indicate they have finished with it.
 716  */
 717
 718 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 719 {
 720         struct net_device *dev;
 721
 722         rcu_read_lock();
 723         dev = dev_get_by_index_rcu(net, ifindex);
 724         if (dev)
 725                 dev_hold(dev);
 726         rcu_read_unlock();
 727         return dev;
 728 }
 729 EXPORT_SYMBOL(dev_get_by_index);
 730
 731 /**
 732  *      dev_getbyhwaddr - find a device by its hardware address
 733  *      @net: the applicable net namespace
 734  *      @type: media type of device
 735  *      @ha: hardware address
 736  *
 737  *      Search for an interface by MAC address. Returns NULL if the device
 738  *      is not found or a pointer to the device. The caller must hold the
 739  *      rtnl semaphore. The returned device has not had its ref count increased
 740  *      and the caller must therefore be careful about locking
 741  *
 742  *      BUGS:
 743  *      If the API was consistent this would be __dev_get_by_hwaddr
 744  */
 745
 746 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 747 {
 748         struct net_device *dev;
 749
 750         ASSERT_RTNL();
 751
 752         for_each_netdev(net, dev)
 753                 if (dev->type == type &&
 754                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 755                         return dev;
 756
 757         return NULL;
 758 }
 759 EXPORT_SYMBOL(dev_getbyhwaddr);
 760
 761 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 762 {
 763         struct net_device *dev;
 764
 765         ASSERT_RTNL();
 766         for_each_netdev(net, dev)
 767                 if (dev->type == type)
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 773
 774 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         rtnl_lock();
 779         dev = __dev_getfirstbyhwtype(net, type);
 780         if (dev)
 781                 dev_hold(dev);
 782         rtnl_unlock();
 783         return dev;
 784 }
 785 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 786
 787 /**
 788  *      dev_get_by_flags - find any device with given flags
 789  *      @net: the applicable net namespace
 790  *      @if_flags: IFF_* values
 791  *      @mask: bitmask of bits in if_flags to check
 792  *
 793  *      Search for any interface with the given flags. Returns NULL if a device
 794  *      is not found or a pointer to the device. The device returned has
 795  *      had a reference added and the pointer is safe until the user calls
 796  *      dev_put to indicate they have finished with it.
 797  */
 798
 799 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 800                                     unsigned short mask)
 801 {
 802         struct net_device *dev, *ret;
 803
 804         ret = NULL;
 805         rcu_read_lock();
 806         for_each_netdev_rcu(net, dev) {
 807                 if (((dev->flags ^ if_flags) & mask) == 0) {
 808                         dev_hold(dev);
 809                         ret = dev;
 810                         break;
 811                 }
 812         }
 813         rcu_read_unlock();
 814         return ret;
 815 }
 816 EXPORT_SYMBOL(dev_get_by_flags);
 817
 818 /**
 819  *      dev_valid_name - check if name is okay for network device
 820  *      @name: name string
 821  *
 822  *      Network device names need to be valid file names to
 823  *      to allow sysfs to work.  We also disallow any kind of
 824  *      whitespace.
 825  */
 826 int dev_valid_name(const char *name)
 827 {
 828         if (*name == '\0')
 829                 return 0;
 830         if (strlen(name) >= IFNAMSIZ)
 831                 return 0;
 832         if (!strcmp(name, ".") || !strcmp(name, ".."))
 833                 return 0;
 834
 835         while (*name) {
 836                 if (*name == '/' || isspace(*name))
 837                         return 0;
 838                 name++;
 839         }
 840         return 1;
 841 }
 842 EXPORT_SYMBOL(dev_valid_name);
 843
 844 /**
 845  *      __dev_alloc_name - allocate a name for a device
 846  *      @net: network namespace to allocate the device name in
 847  *      @name: name format string
 848  *      @buf:  scratch buffer and result name string
 849  *
 850  *      Passed a format string - eg "lt%d" it will try and find a suitable
 851  *      id. It scans list of devices to build up a free map, then chooses
 852  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 853  *      while allocating the name and adding the device in order to avoid
 854  *      duplicates.
 855  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 856  *      Returns the number of the unit assigned or a negative errno code.
 857  */
 858
 859 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 860 {
 861         int i = 0;
 862         const char *p;
 863         const int max_netdevices = 8*PAGE_SIZE;
 864         unsigned long *inuse;
 865         struct net_device *d;
 866
 867         p = strnchr(name, IFNAMSIZ-1, '%');
 868         if (p) {
 869                 /*
 870                  * Verify the string as this thing may have come from
 871                  * the user.  There must be either one "%d" and no other "%"
 872                  * characters.
 873                  */
 874                 if (p[1] != 'd' || strchr(p + 2, '%'))
 875                         return -EINVAL;
 876
 877                 /* Use one page as a bit array of possible slots */
 878                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 879                 if (!inuse)
 880                         return -ENOMEM;
 881
 882                 for_each_netdev(net, d) {
 883                         if (!sscanf(d->name, name, &i))
 884                                 continue;
 885                         if (i < 0 || i >= max_netdevices)
 886                                 continue;
 887
 888                         /*  avoid cases where sscanf is not exact inverse of printf */
 889                         snprintf(buf, IFNAMSIZ, name, i);
 890                         if (!strncmp(buf, d->name, IFNAMSIZ))
 891                                 set_bit(i, inuse);
 892                 }
 893
 894                 i = find_first_zero_bit(inuse, max_netdevices);
 895                 free_page((unsigned long) inuse);
 896         }
 897
 898         if (buf != name)
 899                 snprintf(buf, IFNAMSIZ, name, i);
 900         if (!__dev_get_by_name(net, buf))
 901                 return i;
 902
 903         /* It is possible to run out of possible slots
 904          * when the name is long and there isn't enough space left
 905          * for the digits, or if all bits are used.
 906          */
 907         return -ENFILE;
 908 }
 909
 910 /**
 911  *      dev_alloc_name - allocate a name for a device
 912  *      @dev: device
 913  *      @name: name format string
 914  *
 915  *      Passed a format string - eg "lt%d" it will try and find a suitable
 916  *      id. It scans list of devices to build up a free map, then chooses
 917  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 918  *      while allocating the name and adding the device in order to avoid
 919  *      duplicates.
 920  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 921  *      Returns the number of the unit assigned or a negative errno code.
 922  */
 923
 924 int dev_alloc_name(struct net_device *dev, const char *name)
 925 {
 926         char buf[IFNAMSIZ];
 927         struct net *net;
 928         int ret;
 929
 930         BUG_ON(!dev_net(dev));
 931         net = dev_net(dev);
 932         ret = __dev_alloc_name(net, name, buf);
 933         if (ret >= 0)
 934                 strlcpy(dev->name, buf, IFNAMSIZ);
 935         return ret;
 936 }
 937 EXPORT_SYMBOL(dev_alloc_name);
 938
 939 static int dev_get_valid_name(struct net *net, const char *name, char *buf,
 940                               bool fmt)
 941 {
 942         if (!dev_valid_name(name))
 943                 return -EINVAL;
 944
 945         if (fmt && strchr(name, '%'))
 946                 return __dev_alloc_name(net, name, buf);
 947         else if (__dev_get_by_name(net, name))
 948                 return -EEXIST;
 949         else if (buf != name)
 950                 strlcpy(buf, name, IFNAMSIZ);
 951
 952         return 0;
 953 }
 954
 955 /**
 956  *      dev_change_name - change name of a device
 957  *      @dev: device
 958  *      @newname: name (or format string) must be at least IFNAMSIZ
 959  *
 960  *      Change name of a device, can pass format strings "eth%d".
 961  *      for wildcarding.
 962  */
 963 int dev_change_name(struct net_device *dev, const char *newname)
 964 {
 965         char oldname[IFNAMSIZ];
 966         int err = 0;
 967         int ret;
 968         struct net *net;
 969
 970         ASSERT_RTNL();
 971         BUG_ON(!dev_net(dev));
 972
 973         net = dev_net(dev);
 974         if (dev->flags & IFF_UP)
 975                 return -EBUSY;
 976
 977         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 978                 return 0;
 979
 980         memcpy(oldname, dev->name, IFNAMSIZ);
 981
 982         err = dev_get_valid_name(net, newname, dev->name, 1);
 983         if (err < 0)
 984                 return err;
 985
 986 rollback:
 987         /* For now only devices in the initial network namespace
 988          * are in sysfs.
 989          */
 990         if (net_eq(net, &init_net)) {
 991                 ret = device_rename(&dev->dev, dev->name);
 992                 if (ret) {
 993                         memcpy(dev->name, oldname, IFNAMSIZ);
 994                         return ret;
 995                 }
 996         }
 997
 998         write_lock_bh(&dev_base_lock);
 999         hlist_del(&dev->name_hlist);
1000         write_unlock_bh(&dev_base_lock);
1001
1002         synchronize_rcu();
1003
1004         write_lock_bh(&dev_base_lock);
1005         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1006         write_unlock_bh(&dev_base_lock);
1007
1008         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1009         ret = notifier_to_errno(ret);
1010
1011         if (ret) {
1012                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1013                 if (err >= 0) {
1014                         err = ret;
1015                         memcpy(dev->name, oldname, IFNAMSIZ);
1016                         goto rollback;
1017                 } else {
1018                         printk(KERN_ERR
1019                                "%s: name change rollback failed: %d.\n",
1020                                dev->name, ret);
1021                 }
1022         }
1023
1024         return err;
1025 }
1026
1027 /**
1028  *      dev_set_alias - change ifalias of a device
1029  *      @dev: device
1030  *      @alias: name up to IFALIASZ
1031  *      @len: limit of bytes to copy from info
1032  *
1033  *      Set ifalias for a device,
1034  */
1035 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1036 {
1037         ASSERT_RTNL();
1038
1039         if (len >= IFALIASZ)
1040                 return -EINVAL;
1041
1042         if (!len) {
1043                 if (dev->ifalias) {
1044                         kfree(dev->ifalias);
1045                         dev->ifalias = NULL;
1046                 }
1047                 return 0;
1048         }
1049
1050         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1051         if (!dev->ifalias)
1052                 return -ENOMEM;
1053
1054         strlcpy(dev->ifalias, alias, len+1);
1055         return len;
1056 }
1057
1058
1059 /**
1060  *      netdev_features_change - device changes features
1061  *      @dev: device to cause notification
1062  *
1063  *      Called to indicate a device has changed features.
1064  */
1065 void netdev_features_change(struct net_device *dev)
1066 {
1067         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1068 }
1069 EXPORT_SYMBOL(netdev_features_change);
1070
1071 /**
1072  *      netdev_state_change - device changes state
1073  *      @dev: device to cause notification
1074  *
1075  *      Called to indicate a device has changed state. This function calls
1076  *      the notifier chains for netdev_chain and sends a NEWLINK message
1077  *      to the routing socket.
1078  */
1079 void netdev_state_change(struct net_device *dev)
1080 {
1081         if (dev->flags & IFF_UP) {
1082                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1083                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1084         }
1085 }
1086 EXPORT_SYMBOL(netdev_state_change);
1087
/*
 * Pass @event for @dev to the netdevice notifier chain unchanged.  The
 * chain's return value is not examined.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1093
1094 /**
1095  *      dev_load        - load a network module
1096  *      @net: the applicable net namespace
1097  *      @name: name of interface
1098  *
1099  *      If a network interface is not present and the process has suitable
1100  *      privileges this function loads the module. If module loading is not
1101  *      available in this kernel then it becomes a nop.
1102  */
1103
1104 void dev_load(struct net *net, const char *name)
1105 {
1106         struct net_device *dev;
1107
1108         rcu_read_lock();
1109         dev = dev_get_by_name_rcu(net, name);
1110         rcu_read_unlock();
1111
1112         if (!dev && capable(CAP_NET_ADMIN))
1113                 request_module("%s", name);
1114 }
1115 EXPORT_SYMBOL(dev_load);
1116
1117 static int __dev_open(struct net_device *dev)
1118 {
1119         const struct net_device_ops *ops = dev->netdev_ops;
1120         int ret;
1121
1122         ASSERT_RTNL();
1123
1124         /*
1125          *      Is it even present?
1126          */
1127         if (!netif_device_present(dev))
1128                 return -ENODEV;
1129
1130         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1131         ret = notifier_to_errno(ret);
1132         if (ret)
1133                 return ret;
1134
1135         /*
1136          *      Call device private open method
1137          */
1138         set_bit(__LINK_STATE_START, &dev->state);
1139
1140         if (ops->ndo_validate_addr)
1141                 ret = ops->ndo_validate_addr(dev);
1142
1143         if (!ret && ops->ndo_open)
1144                 ret = ops->ndo_open(dev);
1145
1146         /*
1147          *      If it went open OK then:
1148          */
1149
1150         if (ret)
1151                 clear_bit(__LINK_STATE_START, &dev->state);
1152         else {
1153                 /*
1154                  *      Set the flags.
1155                  */
1156                 dev->flags |= IFF_UP;
1157
1158                 /*
1159                  *      Enable NET_DMA
1160                  */
1161                 net_dmaengine_get();
1162
1163                 /*
1164                  *      Initialize multicasting status
1165                  */
1166                 dev_set_rx_mode(dev);
1167
1168                 /*
1169                  *      Wakeup transmit queue engine
1170                  */
1171                 dev_activate(dev);
1172         }
1173
1174         return ret;
1175 }
1176
1177 /**
1178  *      dev_open        - prepare an interface for use.
1179  *      @dev:   device to open
1180  *
1181  *      Takes a device from down to up state. The device's private open
1182  *      function is invoked and then the multicast lists are loaded. Finally
1183  *      the device is moved into the up state and a %NETDEV_UP message is
1184  *      sent to the netdev notifier chain.
1185  *
1186  *      Calling this function on an active interface is a nop. On a failure
1187  *      a negative errno code is returned.
1188  */
1189 int dev_open(struct net_device *dev)
1190 {
1191         int ret;
1192
1193         /*
1194          *      Is it already up?
1195          */
1196         if (dev->flags & IFF_UP)
1197                 return 0;
1198
1199         /*
1200          *      Open device
1201          */
1202         ret = __dev_open(dev);
1203         if (ret < 0)
1204                 return ret;
1205
1206         /*
1207          *      ... and announce new interface.
1208          */
1209         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210         call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL(dev_open);
1215
1216 static int __dev_close(struct net_device *dev)
1217 {
1218         const struct net_device_ops *ops = dev->netdev_ops;
1219
1220         ASSERT_RTNL();
1221         might_sleep();
1222
1223         /*
1224          *      Tell people we are going down, so that they can
1225          *      prepare to death, when device is still operating.
1226          */
1227         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1228
1229         clear_bit(__LINK_STATE_START, &dev->state);
1230
1231         /* Synchronize to scheduled poll. We cannot touch poll list,
1232          * it can be even on different cpu. So just clear netif_running().
1233          *
1234          * dev->stop() will invoke napi_disable() on all of it's
1235          * napi_struct instances on this device.
1236          */
1237         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1238
1239         dev_deactivate(dev);
1240
1241         /*
1242          *      Call the device specific close. This cannot fail.
1243          *      Only if device is UP
1244          *
1245          *      We allow it to be called even after a DETACH hot-plug
1246          *      event.
1247          */
1248         if (ops->ndo_stop)
1249                 ops->ndo_stop(dev);
1250
1251         /*
1252          *      Device is now down.
1253          */
1254
1255         dev->flags &= ~IFF_UP;
1256
1257         /*
1258          *      Shutdown NET_DMA
1259          */
1260         net_dmaengine_put();
1261
1262         return 0;
1263 }
1264
1265 /**
1266  *      dev_close - shutdown an interface.
1267  *      @dev: device to shutdown
1268  *
1269  *      This function moves an active device into down state. A
1270  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1271  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1272  *      chain.
1273  */
1274 int dev_close(struct net_device *dev)
1275 {
1276         if (!(dev->flags & IFF_UP))
1277                 return 0;
1278
1279         __dev_close(dev);
1280
1281         /*
1282          * Tell people we are down
1283          */
1284         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1285         call_netdevice_notifiers(NETDEV_DOWN, dev);
1286
1287         return 0;
1288 }
1289 EXPORT_SYMBOL(dev_close);
1290
1291
1292 /**
1293  *      dev_disable_lro - disable Large Receive Offload on a device
1294  *      @dev: device
1295  *
1296  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1297  *      called under RTNL.  This is needed if received packets may be
1298  *      forwarded to another interface.
1299  */
1300 void dev_disable_lro(struct net_device *dev)
1301 {
1302         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1303             dev->ethtool_ops->set_flags) {
1304                 u32 flags = dev->ethtool_ops->get_flags(dev);
1305                 if (flags & ETH_FLAG_LRO) {
1306                         flags &= ~ETH_FLAG_LRO;
1307                         dev->ethtool_ops->set_flags(dev, flags);
1308                 }
1309         }
1310         WARN_ON(dev->features & NETIF_F_LRO);
1311 }
1312 EXPORT_SYMBOL(dev_disable_lro);
1313
1314
1315 static int dev_boot_phase = 1;
1316
1317 /*
1318  *      Device change register/unregister. These are not inline or static
1319  *      as we export them to the world.
1320  */
1321
1322 /**
1323  *      register_netdevice_notifier - register a network notifier block
1324  *      @nb: notifier
1325  *
1326  *      Register a notifier to be called when network device events occur.
1327  *      The notifier passed is linked into the kernel structures and must
1328  *      not be reused until it has been unregistered. A negative errno code
1329  *      is returned on a failure.
1330  *
1331  *      When registered all registration and up events are replayed
1332  *      to the new notifier to allow device to have a race free
1333  *      view of the network device list.
1334  */
1335
1336 int register_netdevice_notifier(struct notifier_block *nb)
1337 {
1338         struct net_device *dev;
1339         struct net_device *last;
1340         struct net *net;
1341         int err;
1342
1343         rtnl_lock();
1344         err = raw_notifier_chain_register(&netdev_chain, nb);
1345         if (err)
1346                 goto unlock;
1347         if (dev_boot_phase)
1348                 goto unlock;
1349         for_each_net(net) {
1350                 for_each_netdev(net, dev) {
1351                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1352                         err = notifier_to_errno(err);
1353                         if (err)
1354                                 goto rollback;
1355
1356                         if (!(dev->flags & IFF_UP))
1357                                 continue;
1358
1359                         nb->notifier_call(nb, NETDEV_UP, dev);
1360                 }
1361         }
1362
1363 unlock:
1364         rtnl_unlock();
1365         return err;
1366
1367 rollback:
1368         last = dev;
1369         for_each_net(net) {
1370                 for_each_netdev(net, dev) {
1371                         if (dev == last)
1372                                 break;
1373
1374                         if (dev->flags & IFF_UP) {
1375                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1376                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1377                         }
1378                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1379                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1380                 }
1381         }
1382
1383         raw_notifier_chain_unregister(&netdev_chain, nb);
1384         goto unlock;
1385 }
1386 EXPORT_SYMBOL(register_netdevice_notifier);
1387
1388 /**
1389  *      unregister_netdevice_notifier - unregister a network notifier block
1390  *      @nb: notifier
1391  *
1392  *      Unregister a notifier previously registered by
1393  *      register_netdevice_notifier(). The notifier is unlinked into the
1394  *      kernel structures and may then be reused. A negative errno code
1395  *      is returned on a failure.
1396  */
1397
1398 int unregister_netdevice_notifier(struct notifier_block *nb)
1399 {
1400         int err;
1401
1402         rtnl_lock();
1403         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1404         rtnl_unlock();
1405         return err;
1406 }
1407 EXPORT_SYMBOL(unregister_netdevice_notifier);
1408
1409 /**
1410  *      call_netdevice_notifiers - call all network notifier blocks
1411  *      @val: value passed unmodified to notifier function
1412  *      @dev: net_device pointer passed unmodified to notifier function
1413  *
1414  *      Call all network notifier blocks.  Parameters and return value
1415  *      are as for raw_notifier_call_chain().
1416  */
1417
1418 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1419 {
1420         return raw_notifier_call_chain(&netdev_chain, val, dev);
1421 }
1422
1423 /* When > 0 there are consumers of rx skb time stamps */
1424 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1425
1426 void net_enable_timestamp(void)
1427 {
1428         atomic_inc(&netstamp_needed);
1429 }
1430 EXPORT_SYMBOL(net_enable_timestamp);
1431
1432 void net_disable_timestamp(void)
1433 {
1434         atomic_dec(&netstamp_needed);
1435 }
1436 EXPORT_SYMBOL(net_disable_timestamp);
1437
1438 static inline void net_timestamp(struct sk_buff *skb)
1439 {
1440         if (atomic_read(&netstamp_needed))
1441                 __net_timestamp(skb);
1442         else
1443                 skb->tstamp.tv64 = 0;
1444 }
1445
1446 /**
1447  * dev_forward_skb - loopback an skb to another netif
1448  *
1449  * @dev: destination network device
1450  * @skb: buffer to forward
1451  *
1452  * return values:
1453  *      NET_RX_SUCCESS  (no congestion)
1454  *      NET_RX_DROP     (packet was dropped)
1455  *
1456  * dev_forward_skb can be used for injecting an skb from the
1457  * start_xmit function of one device into the receive queue
1458  * of another device.
1459  *
1460  * The receiving device may be in another namespace, so
1461  * we have to clear all information in the skb that could
1462  * impact namespace isolation.
1463  */
1464 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1465 {
1466         skb_orphan(skb);
1467
1468         if (!(dev->flags & IFF_UP))
1469                 return NET_RX_DROP;
1470
1471         if (skb->len > (dev->mtu + dev->hard_header_len))
1472                 return NET_RX_DROP;
1473
1474         skb_set_dev(skb, dev);
1475         skb->tstamp.tv64 = 0;
1476         skb->pkt_type = PACKET_HOST;
1477         skb->protocol = eth_type_trans(skb, dev);
1478         return netif_rx(skb);
1479 }
1480 EXPORT_SYMBOL_GPL(dev_forward_skb);
1481
1482 /*
1483  *      Support routine. Sends outgoing frames to any network
1484  *      taps currently in use.
1485  */
1486
1487 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1488 {
1489         struct packet_type *ptype;
1490
1491 #ifdef CONFIG_NET_CLS_ACT
1492         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1493                 net_timestamp(skb);
1494 #else
1495         net_timestamp(skb);
1496 #endif
1497
1498         rcu_read_lock();
1499         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1500                 /* Never send packets back to the socket
1501                  * they originated from - MvS (miquels@drinkel.ow.org)
1502                  */
1503                 if ((ptype->dev == dev || !ptype->dev) &&
1504                     (ptype->af_packet_priv == NULL ||
1505                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1506                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1507                         if (!skb2)
1508                                 break;
1509
1510                         /* skb->nh should be correctly
1511                            set by sender, so that the second statement is
1512                            just protection against buggy protocols.
1513                          */
1514                         skb_reset_mac_header(skb2);
1515
1516                         if (skb_network_header(skb2) < skb2->data ||
1517                             skb2->network_header > skb2->tail) {
1518                                 if (net_ratelimit())
1519                                         printk(KERN_CRIT "protocol %04x is "
1520                                                "buggy, dev %s\n",
1521                                                skb2->protocol, dev->name);
1522                                 skb_reset_network_header(skb2);
1523                         }
1524
1525                         skb2->transport_header = skb2->network_header;
1526                         skb2->pkt_type = PACKET_OUTGOING;
1527                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1528                 }
1529         }
1530         rcu_read_unlock();
1531 }
1532
1533
1534 static inline void __netif_reschedule(struct Qdisc *q)
1535 {
1536         struct softnet_data *sd;
1537         unsigned long flags;
1538
1539         local_irq_save(flags);
1540         sd = &__get_cpu_var(softnet_data);
1541         q->next_sched = sd->output_queue;
1542         sd->output_queue = q;
1543         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1544         local_irq_restore(flags);
1545 }
1546
1547 void __netif_schedule(struct Qdisc *q)
1548 {
1549         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1550                 __netif_reschedule(q);
1551 }
1552 EXPORT_SYMBOL(__netif_schedule);
1553
1554 void dev_kfree_skb_irq(struct sk_buff *skb)
1555 {
1556         if (atomic_dec_and_test(&skb->users)) {
1557                 struct softnet_data *sd;
1558                 unsigned long flags;
1559
1560                 local_irq_save(flags);
1561                 sd = &__get_cpu_var(softnet_data);
1562                 skb->next = sd->completion_queue;
1563                 sd->completion_queue = skb;
1564                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1565                 local_irq_restore(flags);
1566         }
1567 }
1568 EXPORT_SYMBOL(dev_kfree_skb_irq);
1569
/*
 * Free @skb with the variant that suits the calling context:
 * dev_kfree_skb_irq() in hard-irq context or with interrupts
 * disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1578
1579
1580 /**
1581  * netif_device_detach - mark device as removed
1582  * @dev: network device
1583  *
1584  * Mark device as removed from system and therefore no longer available.
1585  */
1586 void netif_device_detach(struct net_device *dev)
1587 {
1588         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1589             netif_running(dev)) {
1590                 netif_tx_stop_all_queues(dev);
1591         }
1592 }
1593 EXPORT_SYMBOL(netif_device_detach);
1594
1595 /**
1596  * netif_device_attach - mark device as attached
1597  * @dev: network device
1598  *
1599  * Mark device as attached from system and restart if needed.
1600  */
1601 void netif_device_attach(struct net_device *dev)
1602 {
1603         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1604             netif_running(dev)) {
1605                 netif_tx_wake_all_queues(dev);
1606                 __netdev_watchdog_up(dev);
1607         }
1608 }
1609 EXPORT_SYMBOL(netif_device_attach);
1610
1611 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1612 {
1613         return ((features & NETIF_F_GEN_CSUM) ||
1614                 ((features & NETIF_F_IP_CSUM) &&
1615                  protocol == htons(ETH_P_IP)) ||
1616                 ((features & NETIF_F_IPV6_CSUM) &&
1617                  protocol == htons(ETH_P_IPV6)) ||
1618                 ((features & NETIF_F_FCOE_CRC) &&
1619                  protocol == htons(ETH_P_FCOE)));
1620 }
1621
1622 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1623 {
1624         if (can_checksum_protocol(dev->features, skb->protocol))
1625                 return true;
1626
1627         if (skb->protocol == htons(ETH_P_8021Q)) {
1628                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1629                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1630                                           veh->h_vlan_encapsulated_proto))
1631                         return true;
1632         }
1633
1634         return false;
1635 }
1636
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst of @skb is always dropped. If the skb already belongs to a
 * device in a different network namespace than @dev, all data private
 * to that namespace (secpath, netfilter state, secmark, mark,
 * priority, trace and ipvs flags, tc_index) is reset before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_netns;

	skb_dst_drop(skb);

	crosses_netns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_netns) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1666
1667 /*
1668  * Invalidate hardware checksum when packet is to be mangled, and
1669  * complete checksum manually on outgoing path.
1670  */
1671 int skb_checksum_help(struct sk_buff *skb)
1672 {
1673         __wsum csum;
1674         int ret = 0, offset;
1675
1676         if (skb->ip_summed == CHECKSUM_COMPLETE)
1677                 goto out_set_summed;
1678
1679         if (unlikely(skb_shinfo(skb)->gso_size)) {
1680                 /* Let GSO fix up the checksum. */
1681                 goto out_set_summed;
1682         }
1683
1684         offset = skb->csum_start - skb_headroom(skb);
1685         BUG_ON(offset >= skb_headlen(skb));
1686         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1687
1688         offset += skb->csum_offset;
1689         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1690
1691         if (skb_cloned(skb) &&
1692             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1693                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1694                 if (ret)
1695                         goto out;
1696         }
1697
1698         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1699 out_set_summed:
1700         skb->ip_summed = CHECKSUM_NONE;
1701 out:
1702         return ret;
1703 }
1704 EXPORT_SYMBOL(skb_checksum_help);
1705
1706 /**
1707  *      skb_gso_segment - Perform segmentation on skb.
1708  *      @skb: buffer to segment
1709  *      @features: features for the output path (see dev->features)
1710  *
1711  *      This function segments the given skb and returns a list of segments.
1712  *
1713  *      It may return NULL if the skb requires no segmentation.  This is
1714  *      only possible when GSO is used for verifying header integrity.
1715  */
1716 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1717 {
1718         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1719         struct packet_type *ptype;
1720         __be16 type = skb->protocol;
1721         int err;
1722
1723         skb_reset_mac_header(skb);
1724         skb->mac_len = skb->network_header - skb->mac_header;
1725         __skb_pull(skb, skb->mac_len);
1726
1727         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1728                 struct net_device *dev = skb->dev;
1729                 struct ethtool_drvinfo info = {};
1730
1731                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1732                         dev->ethtool_ops->get_drvinfo(dev, &info);
1733
1734                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1735                         "ip_summed=%d",
1736                      info.driver, dev ? dev->features : 0L,
1737                      skb->sk ? skb->sk->sk_route_caps : 0L,
1738                      skb->len, skb->data_len, skb->ip_summed);
1739
1740                 if (skb_header_cloned(skb) &&
1741                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1742                         return ERR_PTR(err);
1743         }
1744
1745         rcu_read_lock();
1746         list_for_each_entry_rcu(ptype,
1747                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1748                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1749                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1750                                 err = ptype->gso_send_check(skb);
1751                                 segs = ERR_PTR(err);
1752                                 if (err || skb_gso_ok(skb, features))
1753                                         break;
1754                                 __skb_push(skb, (skb->data -
1755                                                  skb_network_header(skb)));
1756                         }
1757                         segs = ptype->gso_segment(skb, features);
1758                         break;
1759                 }
1760         }
1761         rcu_read_unlock();
1762
1763         __skb_push(skb, skb->data - skb_mac_header(skb));
1764
1765         return segs;
1766 }
1767 EXPORT_SYMBOL(skb_gso_segment);
1768
/*
 * Report a hardware receive checksum failure: rate-limited log line
 * naming the device (or "<unknown>" when @dev is NULL) plus a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1781
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 *
 * Actually, this check should be eliminated as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of memory.
 * 2. No high memory really exists on this machine.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			if (PageHighMem(skb_shinfo(skb)->frags[idx].page))
				return 1;
		}
	}
#endif
	return 0;
}
1802
/*
 * Layout placed over skb->cb while an skb carries a list of software
 * GSO segments in skb->next.  dev_gso_segment() saves the skb's own
 * destructor here before installing dev_gso_skb_destructor().
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);	/* saved original destructor, may be NULL */
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1808
1809 static void dev_gso_skb_destructor(struct sk_buff *skb)
1810 {
1811         struct dev_gso_cb *cb;
1812
1813         do {
1814                 struct sk_buff *nskb = skb->next;
1815
1816                 skb->next = nskb->next;
1817                 nskb->next = NULL;
1818                 kfree_skb(nskb);
1819         } while (skb->next);
1820
1821         cb = DEV_GSO_CB(skb);
1822         if (cb->destructor)
1823                 cb->destructor(skb);
1824 }
1825
1826 /**
1827  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1828  *      @skb: buffer to segment
1829  *
1830  *      This function segments the given skb and stores the list of segments
1831  *      in skb->next.
1832  */
1833 static int dev_gso_segment(struct sk_buff *skb)
1834 {
1835         struct net_device *dev = skb->dev;
1836         struct sk_buff *segs;
1837         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1838                                          NETIF_F_SG : 0);
1839
1840         segs = skb_gso_segment(skb, features);
1841
1842         /* Verifying header integrity only. */
1843         if (!segs)
1844                 return 0;
1845
1846         if (IS_ERR(segs))
1847                 return PTR_ERR(segs);
1848
1849         skb->next = segs;
1850         DEV_GSO_CB(skb)->destructor = skb->destructor;
1851         skb->destructor = dev_gso_skb_destructor;
1852
1853         return 0;
1854 }
1855
1856 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1857                         struct netdev_queue *txq)
1858 {
1859         const struct net_device_ops *ops = dev->netdev_ops;
1860         int rc = NETDEV_TX_OK;
1861
1862         if (likely(!skb->next)) {
1863                 if (!list_empty(&ptype_all))
1864                         dev_queue_xmit_nit(skb, dev);
1865
1866                 if (netif_needs_gso(dev, skb)) {
1867                         if (unlikely(dev_gso_segment(skb)))
1868                                 goto out_kfree_skb;
1869                         if (skb->next)
1870                                 goto gso;
1871                 }
1872
1873                 /*
1874                  * If device doesnt need skb->dst, release it right now while
1875                  * its hot in this cpu cache
1876                  */
1877                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1878                         skb_dst_drop(skb);
1879
1880                 rc = ops->ndo_start_xmit(skb, dev);
1881                 if (rc == NETDEV_TX_OK)
1882                         txq_trans_update(txq);
1883                 /*
1884                  * TODO: if skb_orphan() was called by
1885                  * dev->hard_start_xmit() (for example, the unmodified
1886                  * igb driver does that; bnx2 doesn't), then
1887                  * skb_tx_software_timestamp() will be unable to send
1888                  * back the time stamp.
1889                  *
1890                  * How can this be prevented? Always create another
1891                  * reference to the socket before calling
1892                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1893                  * does anything in dev->hard_start_xmit() by clearing
1894                  * the skb destructor before the call and restoring it
1895                  * afterwards, then doing the skb_orphan() ourselves?
1896                  */
1897                 return rc;
1898         }
1899
1900 gso:
1901         do {
1902                 struct sk_buff *nskb = skb->next;
1903
1904                 skb->next = nskb->next;
1905                 nskb->next = NULL;
1906
1907                 /*
1908                  * If device doesnt need nskb->dst, release it right now while
1909                  * its hot in this cpu cache
1910                  */
1911                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1912                         skb_dst_drop(nskb);
1913
1914                 rc = ops->ndo_start_xmit(nskb, dev);
1915                 if (unlikely(rc != NETDEV_TX_OK)) {
1916                         if (rc & ~NETDEV_TX_MASK)
1917                                 goto out_kfree_gso_skb;
1918                         nskb->next = skb->next;
1919                         skb->next = nskb;
1920                         return rc;
1921                 }
1922                 txq_trans_update(txq);
1923                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1924                         return NETDEV_TX_BUSY;
1925         } while (skb->next);
1926
1927 out_kfree_gso_skb:
1928         if (likely(skb->next == NULL))
1929                 skb->destructor = DEV_GSO_CB(skb)->destructor;
1930 out_kfree_skb:
1931         kfree_skb(skb);
1932         return rc;
1933 }
1934
1935 static u32 skb_tx_hashrnd;
1936
1937 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1938 {
1939         u32 hash;
1940
1941         if (skb_rx_queue_recorded(skb)) {
1942                 hash = skb_get_rx_queue(skb);
1943                 while (unlikely(hash >= dev->real_num_tx_queues))
1944                         hash -= dev->real_num_tx_queues;
1945                 return hash;
1946         }
1947
1948         if (skb->sk && skb->sk->sk_hash)
1949                 hash = skb->sk->sk_hash;
1950         else
1951                 hash = skb->protocol;
1952
1953         hash = jhash_1word(hash, skb_tx_hashrnd);
1954
1955         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1956 }
1957 EXPORT_SYMBOL(skb_tx_hash);
1958
1959 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1960 {
1961         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1962                 if (net_ratelimit()) {
1963                         WARN(1, "%s selects TX queue %d, but "
1964                              "real number of TX queues is %d\n",
1965                              dev->name, queue_index,
1966                              dev->real_num_tx_queues);
1967                 }
1968                 return 0;
1969         }
1970         return queue_index;
1971 }
1972
1973 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1974                                         struct sk_buff *skb)
1975 {
1976         u16 queue_index;
1977         struct sock *sk = skb->sk;
1978
1979         if (sk_tx_queue_recorded(sk)) {
1980                 queue_index = sk_tx_queue_get(sk);
1981         } else {
1982                 const struct net_device_ops *ops = dev->netdev_ops;
1983
1984                 if (ops->ndo_select_queue) {
1985                         queue_index = ops->ndo_select_queue(dev, skb);
1986                         queue_index = dev_cap_txqueue(dev, queue_index);
1987                 } else {
1988                         queue_index = 0;
1989                         if (dev->real_num_tx_queues > 1)
1990                                 queue_index = skb_tx_hash(dev, skb);
1991
1992                         if (sk) {
1993                                 struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
1994
1995                                 if (dst && skb_dst(skb) == dst)
1996                                         sk_tx_queue_set(sk, queue_index);
1997                         }
1998                 }
1999         }
2000
2001         skb_set_queue_mapping(skb, queue_index);
2002         return netdev_get_tx_queue(dev, queue_index);
2003 }
2004
2005 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2006                                  struct net_device *dev,
2007                                  struct netdev_queue *txq)
2008 {
2009         spinlock_t *root_lock = qdisc_lock(q);
2010         int rc;
2011
2012         spin_lock(root_lock);
2013         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2014                 kfree_skb(skb);
2015                 rc = NET_XMIT_DROP;
2016         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2017                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2018                 /*
2019                  * This is a work-conserving queue; there are no old skbs
2020                  * waiting to be sent out; and the qdisc is not running -
2021                  * xmit the skb directly.
2022                  */
2023                 __qdisc_update_bstats(q, skb->len);
2024                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2025                         __qdisc_run(q);
2026                 else
2027                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
2028
2029                 rc = NET_XMIT_SUCCESS;
2030         } else {
2031                 rc = qdisc_enqueue_root(skb, q);
2032                 qdisc_run(q);
2033         }
2034         spin_unlock(root_lock);
2035
2036         return rc;
2037 }
2038
2039 /*
2040  * Returns true if either:
2041  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2042  *      2. skb is fragmented and the device does not support SG, or if
2043  *         at least one of fragments is in highmem and device does not
2044  *         support DMA from it.
2045  */
2046 static inline int skb_needs_linearize(struct sk_buff *skb,
2047                                       struct net_device *dev)
2048 {
2049         return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2050                (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2051                                               illegal_highdma(dev, skb)));
2052 }
2053
2054 /**
2055  *      dev_queue_xmit - transmit a buffer
2056  *      @skb: buffer to transmit
2057  *
2058  *      Queue a buffer for transmission to a network device. The caller must
2059  *      have set the device and priority and built the buffer before calling
2060  *      this function. The function can be called from an interrupt.
2061  *
2062  *      A negative errno code is returned on a failure. A success does not
2063  *      guarantee the frame will be transmitted as it may be dropped due
2064  *      to congestion or traffic shaping.
2065  *
2066  * -----------------------------------------------------------------------------------
2067  *      I notice this method can also return errors from the queue disciplines,
2068  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2069  *      be positive.
2070  *
2071  *      Regardless of the return value, the skb is consumed, so it is currently
2072  *      difficult to retry a send to this method.  (You can bump the ref count
2073  *      before sending to hold a reference for retry if you are careful.)
2074  *
2075  *      When calling this method, interrupts MUST be enabled.  This is because
2076  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2077  *          --BLG
2078  */
2079 int dev_queue_xmit(struct sk_buff *skb)
2080 {
2081         struct net_device *dev = skb->dev;
2082         struct netdev_queue *txq;
2083         struct Qdisc *q;
2084         int rc = -ENOMEM;
2085
2086         /* GSO will handle the following emulations directly. */
2087         if (netif_needs_gso(dev, skb))
2088                 goto gso;
2089
2090         /* Convert a paged skb to linear, if required */
2091         if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
2092                 goto out_kfree_skb;
2093
2094         /* If packet is not checksummed and device does not support
2095          * checksumming for this protocol, complete checksumming here.
2096          */
2097         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2098                 skb_set_transport_header(skb, skb->csum_start -
2099                                               skb_headroom(skb));
2100                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2101                         goto out_kfree_skb;
2102         }
2103
2104 gso:
2105         /* Disable soft irqs for various locks below. Also
2106          * stops preemption for RCU.
2107          */
2108         rcu_read_lock_bh();
2109
2110         txq = dev_pick_tx(dev, skb);
2111         q = rcu_dereference_bh(txq->qdisc);
2112
2113 #ifdef CONFIG_NET_CLS_ACT
2114         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2115 #endif
2116         if (q->enqueue) {
2117                 rc = __dev_xmit_skb(skb, q, dev, txq);
2118                 goto out;
2119         }
2120
2121         /* The device has no queue. Common case for software devices:
2122            loopback, all the sorts of tunnels...
2123
2124            Really, it is unlikely that netif_tx_lock protection is necessary
2125            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2126            counters.)
2127            However, it is possible, that they rely on protection
2128            made by us here.
2129
2130            Check this and shot the lock. It is not prone from deadlocks.
2131            Either shot noqueue qdisc, it is even simpler 8)
2132          */
2133         if (dev->flags & IFF_UP) {
2134                 int cpu = smp_processor_id(); /* ok because BHs are off */
2135
2136                 if (txq->xmit_lock_owner != cpu) {
2137
2138                         HARD_TX_LOCK(dev, txq, cpu);
2139
2140                         if (!netif_tx_queue_stopped(txq)) {
2141                                 rc = dev_hard_start_xmit(skb, dev, txq);
2142                                 if (dev_xmit_complete(rc)) {
2143                                         HARD_TX_UNLOCK(dev, txq);
2144                                         goto out;
2145                                 }
2146                         }
2147                         HARD_TX_UNLOCK(dev, txq);
2148                         if (net_ratelimit())
2149                                 printk(KERN_CRIT "Virtual device %s asks to "
2150                                        "queue packet!\n", dev->name);
2151                 } else {
2152                         /* Recursion is detected! It is possible,
2153                          * unfortunately */
2154                         if (net_ratelimit())
2155                                 printk(KERN_CRIT "Dead loop on virtual device "
2156                                        "%s, fix it urgently!\n", dev->name);
2157                 }
2158         }
2159
2160         rc = -ENETDOWN;
2161         rcu_read_unlock_bh();
2162
2163 out_kfree_skb:
2164         kfree_skb(skb);
2165         return rc;
2166 out:
2167         rcu_read_unlock_bh();
2168         return rc;
2169 }
2170 EXPORT_SYMBOL(dev_queue_xmit);
2171
2172
2173 /*=======================================================================
2174                         Receiver routines
2175   =======================================================================*/
2176
/*
 * netif_rx() drops a packet when the per-CPU input_pkt_queue is already
 * longer than this value.
 */
int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not referenced in this part of the file; presumably a per-run packet budget for receive processing - confirm. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive counters; netif_rx() bumps .total and .dropped. */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2182
2183
2184 /**
2185  *      netif_rx        -       post buffer to the network code
2186  *      @skb: buffer to post
2187  *
2188  *      This function receives a packet from a device driver and queues it for
2189  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2190  *      may be dropped during processing for congestion control or by the
2191  *      protocol layers.
2192  *
2193  *      return values:
2194  *      NET_RX_SUCCESS  (no congestion)
2195  *      NET_RX_DROP     (packet was dropped)
2196  *
2197  */
2198
2199 int netif_rx(struct sk_buff *skb)
2200 {
2201         struct softnet_data *queue;
2202         unsigned long flags;
2203
2204         /* if netpoll wants it, pretend we never saw it */
2205         if (netpoll_rx(skb))
2206                 return NET_RX_DROP;
2207
2208         if (!skb->tstamp.tv64)
2209                 net_timestamp(skb);
2210
2211         /*
2212          * The code is rearranged so that the path is the most
2213          * short when CPU is congested, but is still operating.
2214          */
2215         local_irq_save(flags);
2216         queue = &__get_cpu_var(softnet_data);
2217
2218         __get_cpu_var(netdev_rx_stat).total++;
2219         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2220                 if (queue->input_pkt_queue.qlen) {
2221 enqueue:
2222                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2223                         local_irq_restore(flags);
2224                         return NET_RX_SUCCESS;
2225                 }
2226
2227                 napi_schedule(&queue->backlog);
2228                 goto enqueue;
2229         }
2230
2231         __get_cpu_var(netdev_rx_stat).dropped++;
2232         local_irq_restore(flags);
2233
2234         kfree_skb(skb);
2235         return NET_RX_DROP;
2236 }
2237 EXPORT_SYMBOL(netif_rx);
2238
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirqs left pending after the packet has been queued.  Returns the
 * netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2252
2253 static void net_tx_action(struct softirq_action *h)
2254 {
2255         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2256
2257         if (sd->completion_queue) {
2258                 struct sk_buff *clist;
2259
2260                 local_irq_disable();
2261                 clist = sd->completion_queue;
2262                 sd->completion_queue = NULL;
2263                 local_irq_enable();
2264
2265                 while (clist) {
2266                         struct sk_buff *skb = clist;
2267                         clist = clist->next;
2268
2269                         WARN_ON(atomic_read(&skb->users));
2270                         __kfree_skb(skb);
2271                 }
2272         }
2273
2274         if (sd->output_queue) {
2275                 struct Qdisc *head;
2276
2277                 local_irq_disable();
2278                 head = sd->output_queue;
2279                 sd->output_queue = NULL;
2280                 local_irq_enable();
2281
2282                 while (head) {
2283                         struct Qdisc *q = head;
2284                         spinlock_t *root_lock;
2285
2286                         head = head->next_sched;
2287
2288                         root_lock = qdisc_lock(q);
2289                         if (spin_trylock(root_lock)) {
2290                                 smp_mb__before_clear_bit();
2291                                 clear_bit(__QDISC_STATE_SCHED,
2292                                           &q->state);
2293                                 qdisc_run(q);
2294                                 spin_unlock(root_lock);
2295                         } else {
2296                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2297                                               &q->state)) {
2298                                         __netif_reschedule(q);
2299                                 } else {
2300                                         smp_mb__before_clear_bit();
2301                                         clear_bit(__QDISC_STATE_SCHED,
2302                                                   &q->state);
2303                                 }
2304                         }
2305                 }
2306         }
2307 }
2308
2309 static inline int deliver_skb(struct sk_buff *skb,
2310                               struct packet_type *pt_prev,
2311                               struct net_device *orig_dev)
2312 {
2313         atomic_inc(&skb->users);
2314         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2315 }
2316
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * Bridging hook, called by handle_bridge() for frames that arrive on
 * a device with a bridge port.  Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Give the bridge a chance to take @skb.  Loopback packets and packets
 * from devices without a bridge port are returned untouched.  Otherwise
 * a pending *pt_prev handler is delivered to first (and cleared), and
 * the result of the bridge hook is returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2354
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* macvlan hook, called by handle_macvlan() for devices with a macvlan port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Give macvlan a chance to take @skb.  Packets from devices without a
 * macvlan port are returned untouched.  Otherwise a pending *pt_prev
 * handler is delivered to first (and cleared), and the result of the
 * macvlan hook is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2376
2377 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
 * instructions (a compare and two extra stores right now) when
 * sch_ingress is not enabled but CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2386 static int ing_filter(struct sk_buff *skb)
2387 {
2388         struct net_device *dev = skb->dev;
2389         u32 ttl = G_TC_RTTL(skb->tc_verd);
2390         struct netdev_queue *rxq;
2391         int result = TC_ACT_OK;
2392         struct Qdisc *q;
2393
2394         if (MAX_RED_LOOP < ttl++) {
2395                 printk(KERN_WARNING
2396                        "Redir loop detected Dropping packet (%d->%d)\n",
2397                        skb->skb_iif, dev->ifindex);
2398                 return TC_ACT_SHOT;
2399         }
2400
2401         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2402         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2403
2404         rxq = &dev->rx_queue;
2405
2406         q = rxq->qdisc;
2407         if (q != &noop_qdisc) {
2408                 spin_lock(qdisc_lock(q));
2409                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2410                         result = qdisc_enqueue_root(skb, q);
2411                 spin_unlock(qdisc_lock(q));
2412         }
2413
2414         return result;
2415 }
2416
2417 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2418                                          struct packet_type **pt_prev,
2419                                          int *ret, struct net_device *orig_dev)
2420 {
2421         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2422                 goto out;
2423
2424         if (*pt_prev) {
2425                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2426                 *pt_prev = NULL;
2427         } else {
2428                 /* Huh? Why does turning on AF_PACKET affect this? */
2429                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2430         }
2431
2432         switch (ing_filter(skb)) {
2433         case TC_ACT_SHOT:
2434         case TC_ACT_STOLEN:
2435                 kfree_skb(skb);
2436                 return NULL;
2437         }
2438
2439 out:
2440         skb->tc_verd = 0;
2441         return skb;
2442 }
2443 #endif
2444
2445 /*
2446  *      netif_nit_deliver - deliver received packets to network taps
2447  *      @skb: buffer
2448  *
2449  *      This function is used to deliver incoming packets to network
2450  *      taps. It should be used when the normal netif_receive_skb path
2451  *      is bypassed, for example because of VLAN acceleration.
2452  */
2453 void netif_nit_deliver(struct sk_buff *skb)
2454 {
2455         struct packet_type *ptype;
2456
2457         if (list_empty(&ptype_all))
2458                 return;
2459
2460         skb_reset_network_header(skb);
2461         skb_reset_transport_header(skb);
2462         skb->mac_len = skb->network_header - skb->mac_header;
2463
2464         rcu_read_lock();
2465         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2466                 if (!ptype->dev || ptype->dev == skb->dev)
2467                         deliver_skb(skb, ptype, skb->dev);
2468         }
2469         rcu_read_unlock();
2470 }
2471
2472 /**
2473  *      netif_receive_skb - process receive buffer from network
2474  *      @skb: buffer to process
2475  *
2476  *      netif_receive_skb() is the main receive data processing function.
2477  *      It always succeeds. The buffer may be dropped during processing
2478  *      for congestion control or by the protocol layers.
2479  *
2480  *      This function may only be called from softirq context and interrupts
2481  *      should be enabled.
2482  *
2483  *      Return values (usually ignored):
2484  *      NET_RX_SUCCESS: no congestion
2485  *      NET_RX_DROP: packet was dropped
2486  */
2487 int netif_receive_skb(struct sk_buff *skb)
2488 {
2489         struct packet_type *ptype, *pt_prev;
2490         struct net_device *orig_dev;
2491         struct net_device *master;
2492         struct net_device *null_or_orig;
2493         struct net_device *null_or_bond;
2494         int ret = NET_RX_DROP;
2495         __be16 type;
2496
2497         if (!skb->tstamp.tv64)
2498                 net_timestamp(skb);
2499
2500         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2501                 return NET_RX_SUCCESS;
2502
2503         /* if we've gotten here through NAPI, check netpoll */
2504         if (netpoll_receive_skb(skb))
2505                 return NET_RX_DROP;
2506
2507         if (!skb->skb_iif)
2508                 skb->skb_iif = skb->dev->ifindex;
2509
2510         null_or_orig = NULL;
2511         orig_dev = skb->dev;
2512         master = ACCESS_ONCE(orig_dev->master);
2513         if (master) {
2514                 if (skb_bond_should_drop(skb, master))
2515                         null_or_orig = orig_dev; /* deliver only exact match */
2516                 else
2517                         skb->dev = master;
2518         }
2519
2520         __get_cpu_var(netdev_rx_stat).total++;
2521
2522         skb_reset_network_header(skb);
2523         skb_reset_transport_header(skb);
2524         skb->mac_len = skb->network_header - skb->mac_header;
2525
2526         pt_prev = NULL;
2527
2528         rcu_read_lock();
2529
2530 #ifdef CONFIG_NET_CLS_ACT
2531         if (skb->tc_verd & TC_NCLS) {
2532                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2533                 goto ncls;
2534         }
2535 #endif
2536
2537         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2538                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2539                     ptype->dev == orig_dev) {
2540                         if (pt_prev)
2541                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2542                         pt_prev = ptype;
2543                 }
2544         }
2545
2546 #ifdef CONFIG_NET_CLS_ACT
2547         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2548         if (!skb)
2549                 goto out;
2550 ncls:
2551 #endif
2552
2553         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2554         if (!skb)
2555                 goto out;
2556         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2557         if (!skb)
2558                 goto out;
2559
2560         /*
2561          * Make sure frames received on VLAN interfaces stacked on
2562          * bonding interfaces still make their way to any base bonding
2563          * device that may have registered for a specific ptype.  The
2564          * handler may have to adjust skb->dev and orig_dev.
2565          */
2566         null_or_bond = NULL;
2567         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2568             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2569                 null_or_bond = vlan_dev_real_dev(skb->dev);
2570         }
2571
2572         type = skb->protocol;
2573         list_for_each_entry_rcu(ptype,
2574                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2575                 if (ptype->type == type && (ptype->dev == null_or_orig ||
2576                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
2577                      ptype->dev == null_or_bond)) {
2578                         if (pt_prev)
2579                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2580                         pt_prev = ptype;
2581                 }
2582         }
2583
2584         if (pt_prev) {
2585                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2586         } else {
2587                 kfree_skb(skb);
2588                 /* Jamal, now you will not able to escape explaining
2589                  * me how you were going to use this. :-)
2590                  */
2591                 ret = NET_RX_DROP;
2592         }
2593
2594 out:
2595         rcu_read_unlock();
2596         return ret;
2597 }
2598 EXPORT_SYMBOL(netif_receive_skb);
2599
2600 /* Network device is going away, flush any packets still pending  */
2601 static void flush_backlog(void *arg)
2602 {
2603         struct net_device *dev = arg;
2604         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2605         struct sk_buff *skb, *tmp;
2606
2607         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2608                 if (skb->dev == dev) {
2609                         __skb_unlink(skb, &queue->input_pkt_queue);
2610                         kfree_skb(skb);
2611                 }
2612 }
2613
2614 static int napi_gro_complete(struct sk_buff *skb)
2615 {
2616         struct packet_type *ptype;
2617         __be16 type = skb->protocol;
2618         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2619         int err = -ENOENT;
2620
2621         if (NAPI_GRO_CB(skb)->count == 1) {
2622                 skb_shinfo(skb)->gso_size = 0;
2623                 goto out;
2624         }
2625
2626         rcu_read_lock();
2627         list_for_each_entry_rcu(ptype, head, list) {
2628                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2629                         continue;
2630
2631                 err = ptype->gro_complete(skb);
2632                 break;
2633         }
2634         rcu_read_unlock();
2635
2636         if (err) {
2637                 WARN_ON(&ptype->list == head);
2638                 kfree_skb(skb);
2639                 return NET_RX_SUCCESS;
2640         }
2641
2642 out:
2643         return netif_receive_skb(skb);
2644 }
2645
2646 static void napi_gro_flush(struct napi_struct *napi)
2647 {
2648         struct sk_buff *skb, *next;
2649
2650         for (skb = napi->gro_list; skb; skb = next) {
2651                 next = skb->next;
2652                 skb->next = NULL;
2653                 napi_gro_complete(skb);
2654         }
2655
2656         napi->gro_count = 0;
2657         napi->gro_list = NULL;
2658 }
2659
2660 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2661 {
2662         struct sk_buff **pp = NULL;
2663         struct packet_type *ptype;
2664         __be16 type = skb->protocol;
2665         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2666         int same_flow;
2667         int mac_len;
2668         enum gro_result ret;
2669
2670         if (!(skb->dev->features & NETIF_F_GRO))
2671                 goto normal;
2672
2673         if (skb_is_gso(skb) || skb_has_frags(skb))
2674                 goto normal;
2675
2676         rcu_read_lock();
2677         list_for_each_entry_rcu(ptype, head, list) {
2678                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2679                         continue;
2680
2681                 skb_set_network_header(skb, skb_gro_offset(skb));
2682                 mac_len = skb->network_header - skb->mac_header;
2683                 skb->mac_len = mac_len;
2684                 NAPI_GRO_CB(skb)->same_flow = 0;
2685                 NAPI_GRO_CB(skb)->flush = 0;
2686                 NAPI_GRO_CB(skb)->free = 0;
2687
2688                 pp = ptype->gro_receive(&napi->gro_list, skb);
2689                 break;
2690         }
2691         rcu_read_unlock();
2692
2693         if (&ptype->list == head)
2694                 goto normal;
2695
2696         same_flow = NAPI_GRO_CB(skb)->same_flow;
2697         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2698
2699         if (pp) {
2700                 struct sk_buff *nskb = *pp;
2701
2702                 *pp = nskb->next;
2703                 nskb->next = NULL;
2704                 napi_gro_complete(nskb);
2705                 napi->gro_count--;
2706         }
2707
2708         if (same_flow)
2709                 goto ok;
2710
2711         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2712                 goto normal;
2713
2714         napi->gro_count++;
2715         NAPI_GRO_CB(skb)->count = 1;
2716         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2717         skb->next = napi->gro_list;
2718         napi->gro_list = skb;
2719         ret = GRO_HELD;
2720
2721 pull:
2722         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2723                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2724
2725                 BUG_ON(skb->end - skb->tail < grow);
2726
2727                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2728
2729                 skb->tail += grow;
2730                 skb->data_len -= grow;
2731
2732                 skb_shinfo(skb)->frags[0].page_offset += grow;
2733                 skb_shinfo(skb)->frags[0].size -= grow;
2734
2735                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2736                         put_page(skb_shinfo(skb)->frags[0].page);
2737                         memmove(skb_shinfo(skb)->frags,
2738                                 skb_shinfo(skb)->frags + 1,
2739                                 --skb_shinfo(skb)->nr_frags);
2740                 }
2741         }
2742
2743 ok:
2744         return ret;
2745
2746 normal:
2747         ret = GRO_NORMAL;
2748         goto pull;
2749 }
2750 EXPORT_SYMBOL(dev_gro_receive);
2751
2752 static gro_result_t
2753 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2754 {
2755         struct sk_buff *p;
2756
2757         if (netpoll_rx_on(skb))
2758                 return GRO_NORMAL;
2759
2760         for (p = napi->gro_list; p; p = p->next) {
2761                 NAPI_GRO_CB(p)->same_flow =
2762                         (p->dev == skb->dev) &&
2763                         !compare_ether_header(skb_mac_header(p),
2764                                               skb_gro_mac_header(skb));
2765                 NAPI_GRO_CB(p)->flush = 0;
2766         }
2767
2768         return dev_gro_receive(napi, skb);
2769 }
2770
2771 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2772 {
2773         switch (ret) {
2774         case GRO_NORMAL:
2775                 if (netif_receive_skb(skb))
2776                         ret = GRO_DROP;
2777                 break;
2778
2779         case GRO_DROP:
2780         case GRO_MERGED_FREE:
2781                 kfree_skb(skb);
2782                 break;
2783
2784         case GRO_HELD:
2785         case GRO_MERGED:
2786                 break;
2787         }
2788
2789         return ret;
2790 }
2791 EXPORT_SYMBOL(napi_skb_finish);
2792
2793 void skb_gro_reset_offset(struct sk_buff *skb)
2794 {
2795         NAPI_GRO_CB(skb)->data_offset = 0;
2796         NAPI_GRO_CB(skb)->frag0 = NULL;
2797         NAPI_GRO_CB(skb)->frag0_len = 0;
2798
2799         if (skb->mac_header == skb->tail &&
2800             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2801                 NAPI_GRO_CB(skb)->frag0 =
2802                         page_address(skb_shinfo(skb)->frags[0].page) +
2803                         skb_shinfo(skb)->frags[0].page_offset;
2804                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2805         }
2806 }
2807 EXPORT_SYMBOL(skb_gro_reset_offset);
2808
2809 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2810 {
2811         skb_gro_reset_offset(skb);
2812
2813         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2814 }
2815 EXPORT_SYMBOL(napi_gro_receive);
2816
2817 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2818 {
2819         __skb_pull(skb, skb_headlen(skb));
2820         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2821
2822         napi->skb = skb;
2823 }
2824 EXPORT_SYMBOL(napi_reuse_skb);
2825
2826 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2827 {
2828         struct sk_buff *skb = napi->skb;
2829
2830         if (!skb) {
2831                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2832                 if (skb)
2833                         napi->skb = skb;
2834         }
2835         return skb;
2836 }
2837 EXPORT_SYMBOL(napi_get_frags);
2838
2839 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2840                                gro_result_t ret)
2841 {
2842         switch (ret) {
2843         case GRO_NORMAL:
2844         case GRO_HELD:
2845                 skb->protocol = eth_type_trans(skb, skb->dev);
2846
2847                 if (ret == GRO_HELD)
2848                         skb_gro_pull(skb, -ETH_HLEN);
2849                 else if (netif_receive_skb(skb))
2850                         ret = GRO_DROP;
2851                 break;
2852
2853         case GRO_DROP:
2854         case GRO_MERGED_FREE:
2855                 napi_reuse_skb(napi, skb);
2856                 break;
2857
2858         case GRO_MERGED:
2859                 break;
2860         }
2861
2862         return ret;
2863 }
2864 EXPORT_SYMBOL(napi_frags_finish);
2865
2866 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2867 {
2868         struct sk_buff *skb = napi->skb;
2869         struct ethhdr *eth;
2870         unsigned int hlen;
2871         unsigned int off;
2872
2873         napi->skb = NULL;
2874
2875         skb_reset_mac_header(skb);
2876         skb_gro_reset_offset(skb);
2877
2878         off = skb_gro_offset(skb);
2879         hlen = off + sizeof(*eth);
2880         eth = skb_gro_header_fast(skb, off);
2881         if (skb_gro_header_hard(skb, hlen)) {
2882                 eth = skb_gro_header_slow(skb, hlen, off);
2883                 if (unlikely(!eth)) {
2884                         napi_reuse_skb(napi, skb);
2885                         skb = NULL;
2886                         goto out;
2887                 }
2888         }
2889
2890         skb_gro_pull(skb, sizeof(*eth));
2891
2892         /*
2893          * This works because the only protocols we care about don't require
2894          * special handling.  We'll fix it up properly at the end.
2895          */
2896         skb->protocol = eth->h_proto;
2897
2898 out:
2899         return skb;
2900 }
2901 EXPORT_SYMBOL(napi_frags_skb);
2902
2903 gro_result_t napi_gro_frags(struct napi_struct *napi)
2904 {
2905         struct sk_buff *skb = napi_frags_skb(napi);
2906
2907         if (!skb)
2908                 return GRO_DROP;
2909
2910         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2911 }
2912 EXPORT_SYMBOL(napi_gro_frags);
2913
2914 static int process_backlog(struct napi_struct *napi, int quota)
2915 {
2916         int work = 0;
2917         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2918         unsigned long start_time = jiffies;
2919
2920         napi->weight = weight_p;
2921         do {
2922                 struct sk_buff *skb;
2923
2924                 local_irq_disable();
2925                 skb = __skb_dequeue(&queue->input_pkt_queue);
2926                 if (!skb) {
2927                         __napi_complete(napi);
2928                         local_irq_enable();
2929                         break;
2930                 }
2931                 local_irq_enable();
2932
2933                 netif_receive_skb(skb);
2934         } while (++work < quota && jiffies == start_time);
2935
2936         return work;
2937 }
2938
2939 /**
2940  * __napi_schedule - schedule for receive
2941  * @n: entry to schedule
2942  *
2943  * The entry's receive function will be scheduled to run
2944  */
2945 void __napi_schedule(struct napi_struct *n)
2946 {
2947         unsigned long flags;
2948
2949         local_irq_save(flags);
2950         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2951         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2952         local_irq_restore(flags);
2953 }
2954 EXPORT_SYMBOL(__napi_schedule);
2955
2956 void __napi_complete(struct napi_struct *n)
2957 {
2958         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2959         BUG_ON(n->gro_list);
2960
2961         list_del(&n->poll_list);
2962         smp_mb__before_clear_bit();
2963         clear_bit(NAPI_STATE_SCHED, &n->state);
2964 }
2965 EXPORT_SYMBOL(__napi_complete);
2966
2967 void napi_complete(struct napi_struct *n)
2968 {
2969         unsigned long flags;
2970
2971         /*
2972          * don't let napi dequeue from the cpu poll list
2973          * just in case its running on a different cpu
2974          */
2975         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2976                 return;
2977
2978         napi_gro_flush(n);
2979         local_irq_save(flags);
2980         __napi_complete(n);
2981         local_irq_restore(flags);
2982 }
2983 EXPORT_SYMBOL(napi_complete);
2984
2985 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2986                     int (*poll)(struct napi_struct *, int), int weight)
2987 {
2988         INIT_LIST_HEAD(&napi->poll_list);
2989         napi->gro_count = 0;
2990         napi->gro_list = NULL;
2991         napi->skb = NULL;
2992         napi->poll = poll;
2993         napi->weight = weight;
2994         list_add(&napi->dev_list, &dev->napi_list);
2995         napi->dev = dev;
2996 #ifdef CONFIG_NETPOLL
2997         spin_lock_init(&napi->poll_lock);
2998         napi->poll_owner = -1;
2999 #endif
3000         set_bit(NAPI_STATE_SCHED, &napi->state);
3001 }
3002 EXPORT_SYMBOL(netif_napi_add);
3003
3004 void netif_napi_del(struct napi_struct *napi)
3005 {
3006         struct sk_buff *skb, *next;
3007
3008         list_del_init(&napi->dev_list);
3009         napi_free_frags(napi);
3010
3011         for (skb = napi->gro_list; skb; skb = next) {
3012                 next = skb->next;
3013                 skb->next = NULL;
3014                 kfree_skb(skb);
3015         }
3016
3017         napi->gro_list = NULL;
3018         napi->gro_count = 0;
3019 }
3020 EXPORT_SYMBOL(netif_napi_del);
3021
3022
3023 static void net_rx_action(struct softirq_action *h)
3024 {
3025         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
3026         unsigned long time_limit = jiffies + 2;
3027         int budget = netdev_budget;
3028         void *have;
3029
3030         local_irq_disable();
3031
3032         while (!list_empty(list)) {
3033                 struct napi_struct *n;
3034                 int work, weight;
3035
3036                 /* If softirq window is exhuasted then punt.
3037                  * Allow this to run for 2 jiffies since which will allow
3038                  * an average latency of 1.5/HZ.
3039                  */
3040                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3041                         goto softnet_break;
3042
3043                 local_irq_enable();
3044
3045                 /* Even though interrupts have been re-enabled, this
3046                  * access is safe because interrupts can only add new
3047                  * entries to the tail of this list, and only ->poll()
3048                  * calls can remove this head entry from the list.
3049                  */
3050                 n = list_first_entry(list, struct napi_struct, poll_list);
3051
3052                 have = netpoll_poll_lock(n);
3053
3054                 weight = n->weight;
3055
3056                 /* This NAPI_STATE_SCHED test is for avoiding a race
3057                  * with netpoll's poll_napi().  Only the entity which
3058                  * obtains the lock and sees NAPI_STATE_SCHED set will
3059                  * actually make the ->poll() call.  Therefore we avoid
3060                  * accidently calling ->poll() when NAPI is not scheduled.
3061                  */
3062                 work = 0;
3063                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3064                         work = n->poll(n, weight);
3065                         trace_napi_poll(n);
3066                 }
3067
3068                 WARN_ON_ONCE(work > weight);
3069
3070                 budget -= work;
3071
3072                 local_irq_disable();
3073
3074                 /* Drivers must not modify the NAPI state if they
3075                  * consume the entire weight.  In such cases this code
3076                  * still "owns" the NAPI instance and therefore can
3077                  * move the instance around on the list at-will.
3078                  */
3079                 if (unlikely(work == weight)) {
3080                         if (unlikely(napi_disable_pending(n))) {
3081                                 local_irq_enable();
3082                                 napi_complete(n);
3083                                 local_irq_disable();
3084                         } else
3085                                 list_move_tail(&n->poll_list, list);
3086                 }
3087
3088                 netpoll_poll_unlock(have);
3089         }
3090 out:
3091         local_irq_enable();
3092
3093 #ifdef CONFIG_NET_DMA
3094         /*
3095          * There may not be any more sk_buffs coming right now, so push
3096          * any pending DMA copies to hardware
3097          */
3098         dma_issue_pending_all();
3099 #endif
3100
3101         return;
3102
3103 softnet_break:
3104         __get_cpu_var(netdev_rx_stat).time_squeeze++;
3105         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3106         goto out;
3107 }
3108
3109 static gifconf_func_t *gifconf_list[NPROTO];
3110
3111 /**
3112  *      register_gifconf        -       register a SIOCGIF handler
3113  *      @family: Address family
3114  *      @gifconf: Function handler
3115  *
3116  *      Register protocol dependent address dumping routines. The handler
3117  *      that is passed must not be freed or reused until it has been replaced
3118  *      by another handler.
3119  */
3120 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3121 {
3122         if (family >= NPROTO)
3123                 return -EINVAL;
3124         gifconf_list[family] = gifconf;
3125         return 0;
3126 }
3127 EXPORT_SYMBOL(register_gifconf);
3128
3129
3130 /*
3131  *      Map an interface index to its name (SIOCGIFNAME)
3132  */
3133
3134 /*
3135  *      We need this ioctl for efficient implementation of the
3136  *      if_indextoname() function required by the IPv6 API.  Without
3137  *      it, we would have to search all the interfaces to find a
3138  *      match.  --pb
3139  */
3140
3141 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3142 {
3143         struct net_device *dev;
3144         struct ifreq ifr;
3145
3146         /*
3147          *      Fetch the caller's info block.
3148          */
3149
3150         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3151                 return -EFAULT;
3152
3153         rcu_read_lock();
3154         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3155         if (!dev) {
3156                 rcu_read_unlock();
3157                 return -ENODEV;
3158         }
3159
3160         strcpy(ifr.ifr_name, dev->name);
3161         rcu_read_unlock();
3162
3163         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3164                 return -EFAULT;
3165         return 0;
3166 }
3167
3168 /*
3169  *      Perform a SIOCGIFCONF call. This structure will change
3170  *      size eventually, and there is nothing I can do about it.
3171  *      Thus we will need a 'compatibility mode'.
3172  */
3173
3174 static int dev_ifconf(struct net *net, char __user *arg)
3175 {
3176         struct ifconf ifc;
3177         struct net_device *dev;
3178         char __user *pos;
3179         int len;
3180         int total;
3181         int i;
3182
3183         /*
3184          *      Fetch the caller's info block.
3185          */
3186
3187         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3188                 return -EFAULT;
3189
3190         pos = ifc.ifc_buf;
3191         len = ifc.ifc_len;
3192
3193         /*
3194          *      Loop over the interfaces, and write an info block for each.
3195          */
3196
3197         total = 0;
3198         for_each_netdev(net, dev) {
3199                 for (i = 0; i < NPROTO; i++) {
3200                         if (gifconf_list[i]) {
3201                                 int done;
3202                                 if (!pos)
3203                                         done = gifconf_list[i](dev, NULL, 0);
3204                                 else
3205                                         done = gifconf_list[i](dev, pos + total,
3206                                                                len - total);
3207                                 if (done < 0)
3208                                         return -EFAULT;
3209                                 total += done;
3210                         }
3211                 }
3212         }
3213
3214         /*
3215          *      All done.  Write the updated control block back to the caller.
3216          */
3217         ifc.ifc_len = total;
3218
3219         /*
3220          *      Both BSD and Solaris return 0 here, so we do too.
3221          */
3222         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3223 }
3224
3225 #ifdef CONFIG_PROC_FS
3226 /*
3227  *      This is invoked by the /proc filesystem handler to display a device
3228  *      in detail.
3229  */
3230 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3231         __acquires(RCU)
3232 {
3233         struct net *net = seq_file_net(seq);
3234         loff_t off;
3235         struct net_device *dev;
3236
3237         rcu_read_lock();
3238         if (!*pos)
3239                 return SEQ_START_TOKEN;
3240
3241         off = 1;
3242         for_each_netdev_rcu(net, dev)
3243                 if (off++ == *pos)
3244                         return dev;
3245
3246         return NULL;
3247 }
3248
3249 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3250 {
3251         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3252                                   first_net_device(seq_file_net(seq)) :
3253                                   next_net_device((struct net_device *)v);
3254
3255         ++*pos;
3256         return rcu_dereference(dev);
3257 }
3258
3259 void dev_seq_stop(struct seq_file *seq, void *v)
3260         __releases(RCU)
3261 {
3262         rcu_read_unlock();
3263 }
3264
3265 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3266 {
3267         const struct net_device_stats *stats = dev_get_stats(dev);
3268
3269         seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3270                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3271                    dev->name, stats->rx_bytes, stats->rx_packets,
3272                    stats->rx_errors,
3273                    stats->rx_dropped + stats->rx_missed_errors,
3274                    stats->rx_fifo_errors,
3275                    stats->rx_length_errors + stats->rx_over_errors +
3276                     stats->rx_crc_errors + stats->rx_frame_errors,
3277                    stats->rx_compressed, stats->multicast,
3278                    stats->tx_bytes, stats->tx_packets,
3279                    stats->tx_errors, stats->tx_dropped,
3280                    stats->tx_fifo_errors, stats->collisions,
3281                    stats->tx_carrier_errors +
3282                     stats->tx_aborted_errors +
3283                     stats->tx_window_errors +
3284                     stats->tx_heartbeat_errors,
3285                    stats->tx_compressed);
3286 }
3287
3288 /*
3289  *      Called from the PROCfs module. This now uses the new arbitrary sized
3290  *      /proc/net interface to create /proc/net/dev
3291  */
3292 static int dev_seq_show(struct seq_file *seq, void *v)
3293 {
3294         if (v == SEQ_START_TOKEN)
3295                 seq_puts(seq, "Inter-|   Receive                            "
3296                               "                    |  Transmit\n"
3297                               " face |bytes    packets errs drop fifo frame "
3298                               "compressed multicast|bytes    packets errs "
3299                               "drop fifo colls carrier compressed\n");
3300         else
3301                 dev_seq_printf_stats(seq, v);
3302         return 0;
3303 }
3304
3305 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3306 {
3307         struct netif_rx_stats *rc = NULL;
3308
3309         while (*pos < nr_cpu_ids)
3310                 if (cpu_online(*pos)) {
3311                         rc = &per_cpu(netdev_rx_stat, *pos);
3312                         break;
3313                 } else
3314                         ++*pos;
3315         return rc;
3316 }
3317
3318 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3319 {
3320         return softnet_get_online(pos);
3321 }
3322
3323 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3324 {
3325         ++*pos;
3326         return softnet_get_online(pos);
3327 }
3328
3329 static void softnet_seq_stop(struct seq_file *seq, void *v)
3330 {
3331 }
3332
3333 static int softnet_seq_show(struct seq_file *seq, void *v)
3334 {
3335         struct netif_rx_stats *s = v;
3336
3337         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3338                    s->total, s->dropped, s->time_squeeze, 0,
3339                    0, 0, 0, 0, /* was fastroute */
3340                    s->cpu_collision);
3341         return 0;
3342 }
3343
3344 static const struct seq_operations dev_seq_ops = {
3345         .start = dev_seq_start,
3346         .next  = dev_seq_next,
3347         .stop  = dev_seq_stop,
3348         .show  = dev_seq_show,
3349 };
3350
3351 static int dev_seq_open(struct inode *inode, struct file *file)
3352 {
3353         return seq_open_net(inode, file, &dev_seq_ops,
3354                             sizeof(struct seq_net_private));
3355 }
3356
3357 static const struct file_operations dev_seq_fops = {
3358         .owner   = THIS_MODULE,
3359         .open    = dev_seq_open,
3360         .read    = seq_read,
3361         .llseek  = seq_lseek,
3362         .release = seq_release_net,
3363 };
3364
3365 static const struct seq_operations softnet_seq_ops = {
3366         .start = softnet_seq_start,
3367         .next  = softnet_seq_next,
3368         .stop  = softnet_seq_stop,
3369         .show  = softnet_seq_show,
3370 };
3371
3372 static int softnet_seq_open(struct inode *inode, struct file *file)
3373 {
3374         return seq_open(file, &softnet_seq_ops);
3375 }
3376
3377 static const struct file_operations softnet_seq_fops = {
3378         .owner   = THIS_MODULE,
3379         .open    = softnet_seq_open,
3380         .read    = seq_read,
3381         .llseek  = seq_lseek,
3382         .release = seq_release,
3383 };
3384
3385 static void *ptype_get_idx(loff_t pos)
3386 {
3387         struct packet_type *pt = NULL;
3388         loff_t i = 0;
3389         int t;
3390
3391         list_for_each_entry_rcu(pt, &ptype_all, list) {
3392                 if (i == pos)
3393                         return pt;
3394                 ++i;
3395         }
3396
3397         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3398                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3399                         if (i == pos)
3400                                 return pt;
3401                         ++i;
3402                 }
3403         }
3404         return NULL;
3405 }
3406
3407 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3408         __acquires(RCU)
3409 {
3410         rcu_read_lock();
3411         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3412 }
3413
3414 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3415 {
3416         struct packet_type *pt;
3417         struct list_head *nxt;
3418         int hash;
3419
3420         ++*pos;
3421         if (v == SEQ_START_TOKEN)
3422                 return ptype_get_idx(0);
3423
3424         pt = v;
3425         nxt = pt->list.next;
3426         if (pt->type == htons(ETH_P_ALL)) {
3427                 if (nxt != &ptype_all)
3428                         goto found;
3429                 hash = 0;
3430                 nxt = ptype_base[0].next;
3431         } else
3432                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3433
3434         while (nxt == &ptype_base[hash]) {
3435                 if (++hash >= PTYPE_HASH_SIZE)
3436                         return NULL;
3437                 nxt = ptype_base[hash].next;
3438         }
3439 found:
3440         return list_entry(nxt, struct packet_type, list);
3441 }
3442
3443 static void ptype_seq_stop(struct seq_file *seq, void *v)
3444         __releases(RCU)
3445 {
3446         rcu_read_unlock();
3447 }
3448
3449 static int ptype_seq_show(struct seq_file *seq, void *v)
3450 {
3451         struct packet_type *pt = v;
3452
3453         if (v == SEQ_START_TOKEN)
3454                 seq_puts(seq, "Type Device      Function\n");
3455         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3456                 if (pt->type == htons(ETH_P_ALL))
3457                         seq_puts(seq, "ALL ");
3458                 else
3459                         seq_printf(seq, "%04x", ntohs(pt->type));
3460
3461                 seq_printf(seq, " %-8s %pF\n",
3462                            pt->dev ? pt->dev->name : "", pt->func);
3463         }
3464
3465         return 0;
3466 }
3467
3468 static const struct seq_operations ptype_seq_ops = {
3469         .start = ptype_seq_start,
3470         .next  = ptype_seq_next,
3471         .stop  = ptype_seq_stop,
3472         .show  = ptype_seq_show,
3473 };
3474
3475 static int ptype_seq_open(struct inode *inode, struct file *file)
3476 {
3477         return seq_open_net(inode, file, &ptype_seq_ops,
3478                         sizeof(struct seq_net_private));
3479 }
3480
3481 static const struct file_operations ptype_seq_fops = {
3482         .owner   = THIS_MODULE,
3483         .open    = ptype_seq_open,
3484         .read    = seq_read,
3485         .llseek  = seq_lseek,
3486         .release = seq_release_net,
3487 };
3488
3489
3490 static int __net_init dev_proc_net_init(struct net *net)
3491 {
3492         int rc = -ENOMEM;
3493
3494         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3495                 goto out;
3496         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3497                 goto out_dev;
3498         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3499                 goto out_softnet;
3500
3501         if (wext_proc_init(net))
3502                 goto out_ptype;
3503         rc = 0;
3504 out:
3505         return rc;
3506 out_ptype:
3507         proc_net_remove(net, "ptype");
3508 out_softnet:
3509         proc_net_remove(net, "softnet_stat");
3510 out_dev:
3511         proc_net_remove(net, "dev");
3512         goto out;
3513 }
3514
3515 static void __net_exit dev_proc_net_exit(struct net *net)
3516 {
3517         wext_proc_exit(net);
3518
3519         proc_net_remove(net, "ptype");
3520         proc_net_remove(net, "softnet_stat");
3521         proc_net_remove(net, "dev");
3522 }
3523
3524 static struct pernet_operations __net_initdata dev_proc_ops = {
3525         .init = dev_proc_net_init,
3526         .exit = dev_proc_net_exit,
3527 };
3528
3529 static int __init dev_proc_init(void)
3530 {
3531         return register_pernet_subsys(&dev_proc_ops);
3532 }
3533 #else
3534 #define dev_proc_init() 0
3535 #endif  /* CONFIG_PROC_FS */
3536
3537
3538 /**
3539  *      netdev_set_master       -       set up master/slave pair
3540  *      @slave: slave device
3541  *      @master: new master device
3542  *
3543  *      Changes the master device of the slave. Pass %NULL to break the
3544  *      bonding. The caller must hold the RTNL semaphore. On a failure
3545  *      a negative errno code is returned. On success the reference counts
3546  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3547  *      function returns zero.
3548  */
3549 int netdev_set_master(struct net_device *slave, struct net_device *master)
3550 {
3551         struct net_device *old = slave->master;
3552
3553         ASSERT_RTNL();
3554
3555         if (master) {
3556                 if (old)
3557                         return -EBUSY;
3558                 dev_hold(master);
3559         }
3560
3561         slave->master = master;
3562
3563         synchronize_net();
3564
3565         if (old)
3566                 dev_put(old);
3567
3568         if (master)
3569                 slave->flags |= IFF_SLAVE;
3570         else
3571                 slave->flags &= ~IFF_SLAVE;
3572
3573         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3574         return 0;
3575 }
3576 EXPORT_SYMBOL(netdev_set_master);
3577
3578 static void dev_change_rx_flags(struct net_device *dev, int flags)
3579 {
3580         const struct net_device_ops *ops = dev->netdev_ops;
3581
3582         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3583                 ops->ndo_change_rx_flags(dev, flags);
3584 }
3585
3586 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3587 {
3588         unsigned short old_flags = dev->flags;
3589         uid_t uid;
3590         gid_t gid;
3591
3592         ASSERT_RTNL();
3593
3594         dev->flags |= IFF_PROMISC;
3595         dev->promiscuity += inc;
3596         if (dev->promiscuity == 0) {
3597                 /*
3598                  * Avoid overflow.
3599                  * If inc causes overflow, untouch promisc and return error.
3600                  */
3601                 if (inc < 0)
3602                         dev->flags &= ~IFF_PROMISC;
3603                 else {
3604                         dev->promiscuity -= inc;
3605                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3606                                 "set promiscuity failed, promiscuity feature "
3607                                 "of device might be broken.\n", dev->name);
3608                         return -EOVERFLOW;
3609                 }
3610         }
3611         if (dev->flags != old_flags) {
3612                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3613                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3614                                                                "left");
3615                 if (audit_enabled) {
3616                         current_uid_gid(&uid, &gid);
3617                         audit_log(current->audit_context, GFP_ATOMIC,
3618                                 AUDIT_ANOM_PROMISCUOUS,
3619                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3620                                 dev->name, (dev->flags & IFF_PROMISC),
3621                                 (old_flags & IFF_PROMISC),
3622                                 audit_get_loginuid(current),
3623                                 uid, gid,
3624                                 audit_get_sessionid(current));
3625                 }
3626
3627                 dev_change_rx_flags(dev, IFF_PROMISC);
3628         }
3629         return 0;
3630 }
3631
3632 /**
3633  *      dev_set_promiscuity     - update promiscuity count on a device
3634  *      @dev: device
3635  *      @inc: modifier
3636  *
3637  *      Add or remove promiscuity from a device. While the count in the device
3638  *      remains above zero the interface remains promiscuous. Once it hits zero
3639  *      the device reverts back to normal filtering operation. A negative inc
3640  *      value is used to drop promiscuity on the device.
3641  *      Return 0 if successful or a negative errno code on error.
3642  */
3643 int dev_set_promiscuity(struct net_device *dev, int inc)
3644 {
3645         unsigned short old_flags = dev->flags;
3646         int err;
3647
3648         err = __dev_set_promiscuity(dev, inc);
3649         if (err < 0)
3650                 return err;
3651         if (dev->flags != old_flags)
3652                 dev_set_rx_mode(dev);
3653         return err;
3654 }
3655 EXPORT_SYMBOL(dev_set_promiscuity);
3656
3657 /**
3658  *      dev_set_allmulti        - update allmulti count on a device
3659  *      @dev: device
3660  *      @inc: modifier
3661  *
3662  *      Add or remove reception of all multicast frames to a device. While the
3663  *      count in the device remains above zero the interface remains listening
3664  *      to all interfaces. Once it hits zero the device reverts back to normal
3665  *      filtering operation. A negative @inc value is used to drop the counter
3666  *      when releasing a resource needing all multicasts.
3667  *      Return 0 if successful or a negative errno code on error.
3668  */
3669
3670 int dev_set_allmulti(struct net_device *dev, int inc)
3671 {
3672         unsigned short old_flags = dev->flags;
3673
3674         ASSERT_RTNL();
3675
3676         dev->flags |= IFF_ALLMULTI;
3677         dev->allmulti += inc;
3678         if (dev->allmulti == 0) {
3679                 /*
3680                  * Avoid overflow.
3681                  * If inc causes overflow, untouch allmulti and return error.
3682                  */
3683                 if (inc < 0)
3684                         dev->flags &= ~IFF_ALLMULTI;
3685                 else {
3686                         dev->allmulti -= inc;
3687                         printk(KERN_WARNING "%s: allmulti touches roof, "
3688                                 "set allmulti failed, allmulti feature of "
3689                                 "device might be broken.\n", dev->name);
3690                         return -EOVERFLOW;
3691                 }
3692         }
3693         if (dev->flags ^ old_flags) {
3694                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3695                 dev_set_rx_mode(dev);
3696         }
3697         return 0;
3698 }
3699 EXPORT_SYMBOL(dev_set_allmulti);
3700
3701 /*
3702  *      Upload unicast and multicast address lists to device and
3703  *      configure RX filtering. When the device doesn't support unicast
3704  *      filtering it is put in promiscuous mode while unicast addresses
3705  *      are present.
3706  */
3707 void __dev_set_rx_mode(struct net_device *dev)
3708 {
3709         const struct net_device_ops *ops = dev->netdev_ops;
3710
3711         /* dev_open will call this function so the list will stay sane. */
3712         if (!(dev->flags&IFF_UP))
3713                 return;
3714
3715         if (!netif_device_present(dev))
3716                 return;
3717
3718         if (ops->ndo_set_rx_mode)
3719                 ops->ndo_set_rx_mode(dev);
3720         else {
3721                 /* Unicast addresses changes may only happen under the rtnl,
3722                  * therefore calling __dev_set_promiscuity here is safe.
3723                  */
3724                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3725                         __dev_set_promiscuity(dev, 1);
3726                         dev->uc_promisc = 1;
3727                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3728                         __dev_set_promiscuity(dev, -1);
3729                         dev->uc_promisc = 0;
3730                 }
3731
3732                 if (ops->ndo_set_multicast_list)
3733                         ops->ndo_set_multicast_list(dev);
3734         }
3735 }
3736
/* Locked wrapper: run __dev_set_rx_mode() under the device address lock. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3743
3744 /* hw addresses list handling functions */
3745
3746 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3747                          int addr_len, unsigned char addr_type)
3748 {
3749         struct netdev_hw_addr *ha;
3750         int alloc_size;
3751
3752         if (addr_len > MAX_ADDR_LEN)
3753                 return -EINVAL;
3754
3755         list_for_each_entry(ha, &list->list, list) {
3756                 if (!memcmp(ha->addr, addr, addr_len) &&
3757                     ha->type == addr_type) {
3758                         ha->refcount++;
3759                         return 0;
3760                 }
3761         }
3762
3763
3764         alloc_size = sizeof(*ha);
3765         if (alloc_size < L1_CACHE_BYTES)
3766                 alloc_size = L1_CACHE_BYTES;
3767         ha = kmalloc(alloc_size, GFP_ATOMIC);
3768         if (!ha)
3769                 return -ENOMEM;
3770         memcpy(ha->addr, addr, addr_len);
3771         ha->type = addr_type;
3772         ha->refcount = 1;
3773         ha->synced = false;
3774         list_add_tail_rcu(&ha->list, &list->list);
3775         list->count++;
3776         return 0;
3777 }
3778
3779 static void ha_rcu_free(struct rcu_head *head)
3780 {
3781         struct netdev_hw_addr *ha;
3782
3783         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3784         kfree(ha);
3785 }
3786
3787 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3788                          int addr_len, unsigned char addr_type)
3789 {
3790         struct netdev_hw_addr *ha;
3791
3792         list_for_each_entry(ha, &list->list, list) {
3793                 if (!memcmp(ha->addr, addr, addr_len) &&
3794                     (ha->type == addr_type || !addr_type)) {
3795                         if (--ha->refcount)
3796                                 return 0;
3797                         list_del_rcu(&ha->list);
3798                         call_rcu(&ha->rcu_head, ha_rcu_free);
3799                         list->count--;
3800                         return 0;
3801                 }
3802         }
3803         return -ENOENT;
3804 }
3805
3806 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3807                                   struct netdev_hw_addr_list *from_list,
3808                                   int addr_len,
3809                                   unsigned char addr_type)
3810 {
3811         int err;
3812         struct netdev_hw_addr *ha, *ha2;
3813         unsigned char type;
3814
3815         list_for_each_entry(ha, &from_list->list, list) {
3816                 type = addr_type ? addr_type : ha->type;
3817                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3818                 if (err)
3819                         goto unroll;
3820         }
3821         return 0;
3822
3823 unroll:
3824         list_for_each_entry(ha2, &from_list->list, list) {
3825                 if (ha2 == ha)
3826                         break;
3827                 type = addr_type ? addr_type : ha2->type;
3828                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3829         }
3830         return err;
3831 }
3832
3833 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3834                                    struct netdev_hw_addr_list *from_list,
3835                                    int addr_len,
3836                                    unsigned char addr_type)
3837 {
3838         struct netdev_hw_addr *ha;
3839         unsigned char type;
3840
3841         list_for_each_entry(ha, &from_list->list, list) {
3842                 type = addr_type ? addr_type : ha->type;
3843                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3844         }
3845 }
3846
3847 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3848                           struct netdev_hw_addr_list *from_list,
3849                           int addr_len)
3850 {
3851         int err = 0;
3852         struct netdev_hw_addr *ha, *tmp;
3853
3854         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3855                 if (!ha->synced) {
3856                         err = __hw_addr_add(to_list, ha->addr,
3857                                             addr_len, ha->type);
3858                         if (err)
3859                                 break;
3860                         ha->synced = true;
3861                         ha->refcount++;
3862                 } else if (ha->refcount == 1) {
3863                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3864                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3865                 }
3866         }
3867         return err;
3868 }
3869
3870 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3871                              struct netdev_hw_addr_list *from_list,
3872                              int addr_len)
3873 {
3874         struct netdev_hw_addr *ha, *tmp;
3875
3876         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3877                 if (ha->synced) {
3878                         __hw_addr_del(to_list, ha->addr,
3879                                       addr_len, ha->type);
3880                         ha->synced = false;
3881                         __hw_addr_del(from_list, ha->addr,
3882                                       addr_len, ha->type);
3883                 }
3884         }
3885 }
3886
3887 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3888 {
3889         struct netdev_hw_addr *ha, *tmp;
3890
3891         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3892                 list_del_rcu(&ha->list);
3893                 call_rcu(&ha->rcu_head, ha_rcu_free);
3894         }
3895         list->count = 0;
3896 }
3897
3898 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3899 {
3900         INIT_LIST_HEAD(&list->list);
3901         list->count = 0;
3902 }
3903
3904 /* Device addresses handling functions */
3905
3906 static void dev_addr_flush(struct net_device *dev)
3907 {
3908         /* rtnl_mutex must be held here */
3909
3910         __hw_addr_flush(&dev->dev_addrs);
3911         dev->dev_addr = NULL;
3912 }
3913
3914 static int dev_addr_init(struct net_device *dev)
3915 {
3916         unsigned char addr[MAX_ADDR_LEN];
3917         struct netdev_hw_addr *ha;
3918         int err;
3919
3920         /* rtnl_mutex must be held here */
3921
3922         __hw_addr_init(&dev->dev_addrs);
3923         memset(addr, 0, sizeof(addr));
3924         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3925                             NETDEV_HW_ADDR_T_LAN);
3926         if (!err) {
3927                 /*
3928                  * Get the first (previously created) address from the list
3929                  * and set dev_addr pointer to this location.
3930                  */
3931                 ha = list_first_entry(&dev->dev_addrs.list,
3932                                       struct netdev_hw_addr, list);
3933                 dev->dev_addr = ha->addr;
3934         }
3935         return err;
3936 }
3937
3938 /**
3939  *      dev_addr_add    - Add a device address
3940  *      @dev: device
3941  *      @addr: address to add
3942  *      @addr_type: address type
3943  *
3944  *      Add a device address to the device or increase the reference count if
3945  *      it already exists.
3946  *
3947  *      The caller must hold the rtnl_mutex.
3948  */
3949 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3950                  unsigned char addr_type)
3951 {
3952         int err;
3953
3954         ASSERT_RTNL();
3955
3956         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3957         if (!err)
3958                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3959         return err;
3960 }
3961 EXPORT_SYMBOL(dev_addr_add);
3962
3963 /**
3964  *      dev_addr_del    - Release a device address.
3965  *      @dev: device
3966  *      @addr: address to delete
3967  *      @addr_type: address type
3968  *
3969  *      Release reference to a device address and remove it from the device
3970  *      if the reference count drops to zero.
3971  *
3972  *      The caller must hold the rtnl_mutex.
3973  */
3974 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3975                  unsigned char addr_type)
3976 {
3977         int err;
3978         struct netdev_hw_addr *ha;
3979
3980         ASSERT_RTNL();
3981
3982         /*
3983          * We can not remove the first address from the list because
3984          * dev->dev_addr points to that.
3985          */
3986         ha = list_first_entry(&dev->dev_addrs.list,
3987                               struct netdev_hw_addr, list);
3988         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3989                 return -ENOENT;
3990
3991         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3992                             addr_type);
3993         if (!err)
3994                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3995         return err;
3996 }
3997 EXPORT_SYMBOL(dev_addr_del);
3998
3999 /**
4000  *      dev_addr_add_multiple   - Add device addresses from another device
4001  *      @to_dev: device to which addresses will be added
4002  *      @from_dev: device from which addresses will be added
4003  *      @addr_type: address type - 0 means type will be used from from_dev
4004  *
4005  *      Add device addresses of the one device to another.
4006  **
4007  *      The caller must hold the rtnl_mutex.
4008  */
4009 int dev_addr_add_multiple(struct net_device *to_dev,
4010                           struct net_device *from_dev,
4011                           unsigned char addr_type)
4012 {
4013         int err;
4014
4015         ASSERT_RTNL();
4016
4017         if (from_dev->addr_len != to_dev->addr_len)
4018                 return -EINVAL;
4019         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4020                                      to_dev->addr_len, addr_type);
4021         if (!err)
4022                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4023         return err;
4024 }
4025 EXPORT_SYMBOL(dev_addr_add_multiple);
4026
4027 /**
4028  *      dev_addr_del_multiple   - Delete device addresses by another device
4029  *      @to_dev: device where the addresses will be deleted
4030  *      @from_dev: device by which addresses the addresses will be deleted
4031  *      @addr_type: address type - 0 means type will used from from_dev
4032  *
4033  *      Deletes addresses in to device by the list of addresses in from device.
4034  *
4035  *      The caller must hold the rtnl_mutex.
4036  */
4037 int dev_addr_del_multiple(struct net_device *to_dev,
4038                           struct net_device *from_dev,
4039                           unsigned char addr_type)
4040 {
4041         ASSERT_RTNL();
4042
4043         if (from_dev->addr_len != to_dev->addr_len)
4044                 return -EINVAL;
4045         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4046                                to_dev->addr_len, addr_type);
4047         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4048         return 0;
4049 }
4050 EXPORT_SYMBOL(dev_addr_del_multiple);
4051
4052 /* multicast addresses handling functions */
4053
4054 int __dev_addr_delete(struct dev_addr_list **list, int *count,
4055                       void *addr, int alen, int glbl)
4056 {
4057         struct dev_addr_list *da;
4058
4059         for (; (da = *list) != NULL; list = &da->next) {
4060                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4061                     alen == da->da_addrlen) {
4062                         if (glbl) {
4063                                 int old_glbl = da->da_gusers;
4064                                 da->da_gusers = 0;
4065                                 if (old_glbl == 0)
4066                                         break;
4067                         }
4068                         if (--da->da_users)
4069                                 return 0;
4070
4071                         *list = da->next;
4072                         kfree(da);
4073                         (*count)--;
4074                         return 0;
4075                 }
4076         }
4077         return -ENOENT;
4078 }
4079
4080 int __dev_addr_add(struct dev_addr_list **list, int *count,
4081                    void *addr, int alen, int glbl)
4082 {
4083         struct dev_addr_list *da;
4084
4085         for (da = *list; da != NULL; da = da->next) {
4086                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4087                     da->da_addrlen == alen) {
4088                         if (glbl) {
4089                                 int old_glbl = da->da_gusers;
4090                                 da->da_gusers = 1;
4091                                 if (old_glbl)
4092                                         return 0;
4093                         }
4094                         da->da_users++;
4095                         return 0;
4096                 }
4097         }
4098
4099         da = kzalloc(sizeof(*da), GFP_ATOMIC);
4100         if (da == NULL)
4101                 return -ENOMEM;
4102         memcpy(da->da_addr, addr, alen);
4103         da->da_addrlen = alen;
4104         da->da_users = 1;
4105         da->da_gusers = glbl ? 1 : 0;
4106         da->next = *list;
4107         *list = da;
4108         (*count)++;
4109         return 0;
4110 }
4111
4112 /**
4113  *      dev_unicast_delete      - Release secondary unicast address.
4114  *      @dev: device
4115  *      @addr: address to delete
4116  *
4117  *      Release reference to a secondary unicast address and remove it
4118  *      from the device if the reference count drops to zero.
4119  *
4120  *      The caller must hold the rtnl_mutex.
4121  */
4122 int dev_unicast_delete(struct net_device *dev, void *addr)
4123 {
4124         int err;
4125
4126         ASSERT_RTNL();
4127
4128         netif_addr_lock_bh(dev);
4129         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4130                             NETDEV_HW_ADDR_T_UNICAST);
4131         if (!err)
4132                 __dev_set_rx_mode(dev);
4133         netif_addr_unlock_bh(dev);
4134         return err;
4135 }
4136 EXPORT_SYMBOL(dev_unicast_delete);
4137
4138 /**
4139  *      dev_unicast_add         - add a secondary unicast address
4140  *      @dev: device
4141  *      @addr: address to add
4142  *
4143  *      Add a secondary unicast address to the device or increase
4144  *      the reference count if it already exists.
4145  *
4146  *      The caller must hold the rtnl_mutex.
4147  */
4148 int dev_unicast_add(struct net_device *dev, void *addr)
4149 {
4150         int err;
4151
4152         ASSERT_RTNL();
4153
4154         netif_addr_lock_bh(dev);
4155         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4156                             NETDEV_HW_ADDR_T_UNICAST);
4157         if (!err)
4158                 __dev_set_rx_mode(dev);
4159         netif_addr_unlock_bh(dev);
4160         return err;
4161 }
4162 EXPORT_SYMBOL(dev_unicast_add);
4163
4164 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4165                     struct dev_addr_list **from, int *from_count)
4166 {
4167         struct dev_addr_list *da, *next;
4168         int err = 0;
4169
4170         da = *from;
4171         while (da != NULL) {
4172                 next = da->next;
4173                 if (!da->da_synced) {
4174                         err = __dev_addr_add(to, to_count,
4175                                              da->da_addr, da->da_addrlen, 0);
4176                         if (err < 0)
4177                                 break;
4178                         da->da_synced = 1;
4179                         da->da_users++;
4180                 } else if (da->da_users == 1) {
4181                         __dev_addr_delete(to, to_count,
4182                                           da->da_addr, da->da_addrlen, 0);
4183                         __dev_addr_delete(from, from_count,
4184                                           da->da_addr, da->da_addrlen, 0);
4185                 }
4186                 da = next;
4187         }
4188         return err;
4189 }
4190 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4191
4192 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4193                        struct dev_addr_list **from, int *from_count)
4194 {
4195         struct dev_addr_list *da, *next;
4196
4197         da = *from;
4198         while (da != NULL) {
4199                 next = da->next;
4200                 if (da->da_synced) {
4201                         __dev_addr_delete(to, to_count,
4202                                           da->da_addr, da->da_addrlen, 0);
4203                         da->da_synced = 0;
4204                         __dev_addr_delete(from, from_count,
4205                                           da->da_addr, da->da_addrlen, 0);
4206                 }
4207                 da = next;
4208         }
4209 }
4210 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4211
4212 /**
4213  *      dev_unicast_sync - Synchronize device's unicast list to another device
4214  *      @to: destination device
4215  *      @from: source device
4216  *
4217  *      Add newly added addresses to the destination device and release
4218  *      addresses that have no users left. The source device must be
4219  *      locked by netif_tx_lock_bh.
4220  *
4221  *      This function is intended to be called from the dev->set_rx_mode
4222  *      function of layered software devices.
4223  */
4224 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4225 {
4226         int err = 0;
4227
4228         if (to->addr_len != from->addr_len)
4229                 return -EINVAL;
4230
4231         netif_addr_lock_bh(to);
4232         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4233         if (!err)
4234                 __dev_set_rx_mode(to);
4235         netif_addr_unlock_bh(to);
4236         return err;
4237 }
4238 EXPORT_SYMBOL(dev_unicast_sync);
4239
4240 /**
4241  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4242  *      @to: destination device
4243  *      @from: source device
4244  *
4245  *      Remove all addresses that were added to the destination device by
4246  *      dev_unicast_sync(). This function is intended to be called from the
4247  *      dev->stop function of layered software devices.
4248  */
4249 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4250 {
4251         if (to->addr_len != from->addr_len)
4252                 return;
4253
4254         netif_addr_lock_bh(from);
4255         netif_addr_lock(to);
4256         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4257         __dev_set_rx_mode(to);
4258         netif_addr_unlock(to);
4259         netif_addr_unlock_bh(from);
4260 }
4261 EXPORT_SYMBOL(dev_unicast_unsync);
4262
4263 static void dev_unicast_flush(struct net_device *dev)
4264 {
4265         netif_addr_lock_bh(dev);
4266         __hw_addr_flush(&dev->uc);
4267         netif_addr_unlock_bh(dev);
4268 }
4269
4270 static void dev_unicast_init(struct net_device *dev)
4271 {
4272         __hw_addr_init(&dev->uc);
4273 }
4274
4275
4276 static void __dev_addr_discard(struct dev_addr_list **list)
4277 {
4278         struct dev_addr_list *tmp;
4279
4280         while (*list != NULL) {
4281                 tmp = *list;
4282                 *list = tmp->next;
4283                 if (tmp->da_users > tmp->da_gusers)
4284                         printk("__dev_addr_discard: address leakage! "
4285                                "da_users=%d\n", tmp->da_users);
4286                 kfree(tmp);
4287         }
4288 }
4289
4290 static void dev_addr_discard(struct net_device *dev)
4291 {
4292         netif_addr_lock_bh(dev);
4293
4294         __dev_addr_discard(&dev->mc_list);
4295         netdev_mc_count(dev) = 0;
4296
4297         netif_addr_unlock_bh(dev);
4298 }
4299
4300 /**
4301  *      dev_get_flags - get flags reported to userspace
4302  *      @dev: device
4303  *
4304  *      Get the combination of flag bits exported through APIs to userspace.
4305  */
4306 unsigned dev_get_flags(const struct net_device *dev)
4307 {
4308         unsigned flags;
4309
4310         flags = (dev->flags & ~(IFF_PROMISC |
4311                                 IFF_ALLMULTI |
4312                                 IFF_RUNNING |
4313                                 IFF_LOWER_UP |
4314                                 IFF_DORMANT)) |
4315                 (dev->gflags & (IFF_PROMISC |
4316                                 IFF_ALLMULTI));
4317
4318         if (netif_running(dev)) {
4319                 if (netif_oper_up(dev))
4320                         flags |= IFF_RUNNING;
4321                 if (netif_carrier_ok(dev))
4322                         flags |= IFF_LOWER_UP;
4323                 if (netif_dormant(dev))
4324                         flags |= IFF_DORMANT;
4325         }
4326
4327         return flags;
4328 }
4329 EXPORT_SYMBOL(dev_get_flags);
4330
4331 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4332 {
4333         int old_flags = dev->flags;
4334         int ret;
4335
4336         ASSERT_RTNL();
4337
4338         /*
4339          *      Set the flags on our device.
4340          */
4341
4342         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4343                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4344                                IFF_AUTOMEDIA)) |
4345                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4346                                     IFF_ALLMULTI));
4347
4348         /*
4349          *      Load in the correct multicast list now the flags have changed.
4350          */
4351
4352         if ((old_flags ^ flags) & IFF_MULTICAST)
4353                 dev_change_rx_flags(dev, IFF_MULTICAST);
4354
4355         dev_set_rx_mode(dev);
4356
4357         /*
4358          *      Have we downed the interface. We handle IFF_UP ourselves
4359          *      according to user attempts to set it, rather than blindly
4360          *      setting it.
4361          */
4362
4363         ret = 0;
4364         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4365                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4366
4367                 if (!ret)
4368                         dev_set_rx_mode(dev);
4369         }
4370
4371         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4372                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4373
4374                 dev->gflags ^= IFF_PROMISC;
4375                 dev_set_promiscuity(dev, inc);
4376         }
4377
4378         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4379            is important. Some (broken) drivers set IFF_PROMISC, when
4380            IFF_ALLMULTI is requested not asking us and not reporting.
4381          */
4382         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4383                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4384
4385                 dev->gflags ^= IFF_ALLMULTI;
4386                 dev_set_allmulti(dev, inc);
4387         }
4388
4389         return ret;
4390 }
4391
4392 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4393 {
4394         unsigned int changes = dev->flags ^ old_flags;
4395
4396         if (changes & IFF_UP) {
4397                 if (dev->flags & IFF_UP)
4398                         call_netdevice_notifiers(NETDEV_UP, dev);
4399                 else
4400                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4401         }
4402
4403         if (dev->flags & IFF_UP &&
4404             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4405                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4406 }
4407
4408 /**
4409  *      dev_change_flags - change device settings
4410  *      @dev: device
4411  *      @flags: device state flags
4412  *
4413  *      Change settings on device based state flags. The flags are
4414  *      in the userspace exported format.
4415  */
4416 int dev_change_flags(struct net_device *dev, unsigned flags)
4417 {
4418         int ret, changes;
4419         int old_flags = dev->flags;
4420
4421         ret = __dev_change_flags(dev, flags);
4422         if (ret < 0)
4423                 return ret;
4424
4425         changes = old_flags ^ dev->flags;
4426         if (changes)
4427                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4428
4429         __dev_notify_flags(dev, old_flags);
4430         return ret;
4431 }
4432 EXPORT_SYMBOL(dev_change_flags);
4433
4434 /**
4435  *      dev_set_mtu - Change maximum transfer unit
4436  *      @dev: device
4437  *      @new_mtu: new transfer unit
4438  *
4439  *      Change the maximum transfer size of the network device.
4440  */
4441 int dev_set_mtu(struct net_device *dev, int new_mtu)
4442 {
4443         const struct net_device_ops *ops = dev->netdev_ops;
4444         int err;
4445
4446         if (new_mtu == dev->mtu)
4447                 return 0;
4448
4449         /*      MTU must be positive.    */
4450         if (new_mtu < 0)
4451                 return -EINVAL;
4452
4453         if (!netif_device_present(dev))
4454                 return -ENODEV;
4455
4456         err = 0;
4457         if (ops->ndo_change_mtu)
4458                 err = ops->ndo_change_mtu(dev, new_mtu);
4459         else
4460                 dev->mtu = new_mtu;
4461
4462         if (!err && dev->flags & IFF_UP)
4463                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4464         return err;
4465 }
4466 EXPORT_SYMBOL(dev_set_mtu);
4467
4468 /**
4469  *      dev_set_mac_address - Change Media Access Control Address
4470  *      @dev: device
4471  *      @sa: new address
4472  *
4473  *      Change the hardware (MAC) address of the device
4474  */
4475 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4476 {
4477         const struct net_device_ops *ops = dev->netdev_ops;
4478         int err;
4479
4480         if (!ops->ndo_set_mac_address)
4481                 return -EOPNOTSUPP;
4482         if (sa->sa_family != dev->type)
4483                 return -EINVAL;
4484         if (!netif_device_present(dev))
4485                 return -ENODEV;
4486         err = ops->ndo_set_mac_address(dev, sa);
4487         if (!err)
4488                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4489         return err;
4490 }
4491 EXPORT_SYMBOL(dev_set_mac_address);
4492
4493 /*
4494  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4495  */
4496 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4497 {
4498         int err;
4499         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4500
4501         if (!dev)
4502                 return -ENODEV;
4503
4504         switch (cmd) {
4505         case SIOCGIFFLAGS:      /* Get interface flags */
4506                 ifr->ifr_flags = (short) dev_get_flags(dev);
4507                 return 0;
4508
4509         case SIOCGIFMETRIC:     /* Get the metric on the interface
4510                                    (currently unused) */
4511                 ifr->ifr_metric = 0;
4512                 return 0;
4513
4514         case SIOCGIFMTU:        /* Get the MTU of a device */
4515                 ifr->ifr_mtu = dev->mtu;
4516                 return 0;
4517
4518         case SIOCGIFHWADDR:
4519                 if (!dev->addr_len)
4520                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4521                 else
4522                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4523                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4524                 ifr->ifr_hwaddr.sa_family = dev->type;
4525                 return 0;
4526
4527         case SIOCGIFSLAVE:
4528                 err = -EINVAL;
4529                 break;
4530
4531         case SIOCGIFMAP:
4532                 ifr->ifr_map.mem_start = dev->mem_start;
4533                 ifr->ifr_map.mem_end   = dev->mem_end;
4534                 ifr->ifr_map.base_addr = dev->base_addr;
4535                 ifr->ifr_map.irq       = dev->irq;
4536                 ifr->ifr_map.dma       = dev->dma;
4537                 ifr->ifr_map.port      = dev->if_port;
4538                 return 0;
4539
4540         case SIOCGIFINDEX:
4541                 ifr->ifr_ifindex = dev->ifindex;
4542                 return 0;
4543
4544         case SIOCGIFTXQLEN:
4545                 ifr->ifr_qlen = dev->tx_queue_len;
4546                 return 0;
4547
4548         default:
4549                 /* dev_ioctl() should ensure this case
4550                  * is never reached
4551                  */
4552                 WARN_ON(1);
4553                 err = -EINVAL;
4554                 break;
4555
4556         }
4557         return err;
4558 }
4559
4560 /*
4561  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4562  */
4563 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4564 {
4565         int err;
4566         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4567         const struct net_device_ops *ops;
4568
4569         if (!dev)
4570                 return -ENODEV;
4571
4572         ops = dev->netdev_ops;
4573
4574         switch (cmd) {
4575         case SIOCSIFFLAGS:      /* Set interface flags */
4576                 return dev_change_flags(dev, ifr->ifr_flags);
4577
4578         case SIOCSIFMETRIC:     /* Set the metric on the interface
4579                                    (currently unused) */
4580                 return -EOPNOTSUPP;
4581
4582         case SIOCSIFMTU:        /* Set the MTU of a device */
4583                 return dev_set_mtu(dev, ifr->ifr_mtu);
4584
4585         case SIOCSIFHWADDR:
4586                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4587
4588         case SIOCSIFHWBROADCAST:
4589                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4590                         return -EINVAL;
4591                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4592                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4593                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4594                 return 0;
4595
4596         case SIOCSIFMAP:
4597                 if (ops->ndo_set_config) {
4598                         if (!netif_device_present(dev))
4599                                 return -ENODEV;
4600                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4601                 }
4602                 return -EOPNOTSUPP;
4603
4604         case SIOCADDMULTI:
4605                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4606                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4607                         return -EINVAL;
4608                 if (!netif_device_present(dev))
4609                         return -ENODEV;
4610                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4611                                   dev->addr_len, 1);
4612
4613         case SIOCDELMULTI:
4614                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4615                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4616                         return -EINVAL;
4617                 if (!netif_device_present(dev))
4618                         return -ENODEV;
4619                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4620                                      dev->addr_len, 1);
4621
4622         case SIOCSIFTXQLEN:
4623                 if (ifr->ifr_qlen < 0)
4624                         return -EINVAL;
4625                 dev->tx_queue_len = ifr->ifr_qlen;
4626                 return 0;
4627
4628         case SIOCSIFNAME:
4629                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4630                 return dev_change_name(dev, ifr->ifr_newname);
4631
4632         /*
4633          *      Unknown or private ioctl
4634          */
4635         default:
4636                 if ((cmd >= SIOCDEVPRIVATE &&
4637                     cmd <= SIOCDEVPRIVATE + 15) ||
4638                     cmd == SIOCBONDENSLAVE ||
4639                     cmd == SIOCBONDRELEASE ||
4640                     cmd == SIOCBONDSETHWADDR ||
4641                     cmd == SIOCBONDSLAVEINFOQUERY ||
4642                     cmd == SIOCBONDINFOQUERY ||
4643                     cmd == SIOCBONDCHANGEACTIVE ||
4644                     cmd == SIOCGMIIPHY ||
4645                     cmd == SIOCGMIIREG ||
4646                     cmd == SIOCSMIIREG ||
4647                     cmd == SIOCBRADDIF ||
4648                     cmd == SIOCBRDELIF ||
4649                     cmd == SIOCSHWTSTAMP ||
4650                     cmd == SIOCWANDEV) {
4651                         err = -EOPNOTSUPP;
4652                         if (ops->ndo_do_ioctl) {
4653                                 if (netif_device_present(dev))
4654                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4655                                 else
4656                                         err = -ENODEV;
4657                         }
4658                 } else
4659                         err = -EINVAL;
4660
4661         }
4662         return err;
4663 }
4664
4665 /*
4666  *      This function handles all "interface"-type I/O control requests. The actual
4667  *      'doing' part of this is dev_ifsioc above.
4668  */
4669
4670 /**
4671  *      dev_ioctl       -       network device ioctl
4672  *      @net: the applicable net namespace
4673  *      @cmd: command to issue
4674  *      @arg: pointer to a struct ifreq in user space
4675  *
4676  *      Issue ioctl functions to devices. This is normally called by the
4677  *      user space syscall interfaces but can sometimes be useful for
4678  *      other purposes. The return value is the return from the syscall if
4679  *      positive or a negative errno code on error.
4680  */
4681
4682 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4683 {
4684         struct ifreq ifr;
4685         int ret;
4686         char *colon;
4687
4688         /* One special case: SIOCGIFCONF takes ifconf argument
4689            and requires shared lock, because it sleeps writing
4690            to user space.
4691          */
4692
4693         if (cmd == SIOCGIFCONF) {
4694                 rtnl_lock();
4695                 ret = dev_ifconf(net, (char __user *) arg);
4696                 rtnl_unlock();
4697                 return ret;
4698         }
4699         if (cmd == SIOCGIFNAME)
4700                 return dev_ifname(net, (struct ifreq __user *)arg);
4701
4702         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4703                 return -EFAULT;
4704
4705         ifr.ifr_name[IFNAMSIZ-1] = 0;
4706
4707         colon = strchr(ifr.ifr_name, ':');
4708         if (colon)
4709                 *colon = 0;
4710
4711         /*
4712          *      See which interface the caller is talking about.
4713          */
4714
4715         switch (cmd) {
4716         /*
4717          *      These ioctl calls:
4718          *      - can be done by all.
4719          *      - atomic and do not require locking.
4720          *      - return a value
4721          */
4722         case SIOCGIFFLAGS:
4723         case SIOCGIFMETRIC:
4724         case SIOCGIFMTU:
4725         case SIOCGIFHWADDR:
4726         case SIOCGIFSLAVE:
4727         case SIOCGIFMAP:
4728         case SIOCGIFINDEX:
4729         case SIOCGIFTXQLEN:
4730                 dev_load(net, ifr.ifr_name);
4731                 rcu_read_lock();
4732                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4733                 rcu_read_unlock();
4734                 if (!ret) {
4735                         if (colon)
4736                                 *colon = ':';
4737                         if (copy_to_user(arg, &ifr,
4738                                          sizeof(struct ifreq)))
4739                                 ret = -EFAULT;
4740                 }
4741                 return ret;
4742
4743         case SIOCETHTOOL:
4744                 dev_load(net, ifr.ifr_name);
4745                 rtnl_lock();
4746                 ret = dev_ethtool(net, &ifr);
4747                 rtnl_unlock();
4748                 if (!ret) {
4749                         if (colon)
4750                                 *colon = ':';
4751                         if (copy_to_user(arg, &ifr,
4752                                          sizeof(struct ifreq)))
4753                                 ret = -EFAULT;
4754                 }
4755                 return ret;
4756
4757         /*
4758          *      These ioctl calls:
4759          *      - require superuser power.
4760          *      - require strict serialization.
4761          *      - return a value
4762          */
4763         case SIOCGMIIPHY:
4764         case SIOCGMIIREG:
4765         case SIOCSIFNAME:
4766                 if (!capable(CAP_NET_ADMIN))
4767                         return -EPERM;
4768                 dev_load(net, ifr.ifr_name);
4769                 rtnl_lock();
4770                 ret = dev_ifsioc(net, &ifr, cmd);
4771                 rtnl_unlock();
4772                 if (!ret) {
4773                         if (colon)
4774                                 *colon = ':';
4775                         if (copy_to_user(arg, &ifr,
4776                                          sizeof(struct ifreq)))
4777                                 ret = -EFAULT;
4778                 }
4779                 return ret;
4780
4781         /*
4782          *      These ioctl calls:
4783          *      - require superuser power.
4784          *      - require strict serialization.
4785          *      - do not return a value
4786          */
4787         case SIOCSIFFLAGS:
4788         case SIOCSIFMETRIC:
4789         case SIOCSIFMTU:
4790         case SIOCSIFMAP:
4791         case SIOCSIFHWADDR:
4792         case SIOCSIFSLAVE:
4793         case SIOCADDMULTI:
4794         case SIOCDELMULTI:
4795         case SIOCSIFHWBROADCAST:
4796         case SIOCSIFTXQLEN:
4797         case SIOCSMIIREG:
4798         case SIOCBONDENSLAVE:
4799         case SIOCBONDRELEASE:
4800         case SIOCBONDSETHWADDR:
4801         case SIOCBONDCHANGEACTIVE:
4802         case SIOCBRADDIF:
4803         case SIOCBRDELIF:
4804         case SIOCSHWTSTAMP:
4805                 if (!capable(CAP_NET_ADMIN))
4806                         return -EPERM;
4807                 /* fall through */
4808         case SIOCBONDSLAVEINFOQUERY:
4809         case SIOCBONDINFOQUERY:
4810                 dev_load(net, ifr.ifr_name);
4811                 rtnl_lock();
4812                 ret = dev_ifsioc(net, &ifr, cmd);
4813                 rtnl_unlock();
4814                 return ret;
4815
4816         case SIOCGIFMEM:
4817                 /* Get the per device memory space. We can add this but
4818                  * currently do not support it */
4819         case SIOCSIFMEM:
4820                 /* Set the per device memory buffer space.
4821                  * Not applicable in our case */
4822         case SIOCSIFLINK:
4823                 return -EINVAL;
4824
4825         /*
4826          *      Unknown or private ioctl.
4827          */
4828         default:
4829                 if (cmd == SIOCWANDEV ||
4830                     (cmd >= SIOCDEVPRIVATE &&
4831                      cmd <= SIOCDEVPRIVATE + 15)) {
4832                         dev_load(net, ifr.ifr_name);
4833                         rtnl_lock();
4834                         ret = dev_ifsioc(net, &ifr, cmd);
4835                         rtnl_unlock();
4836                         if (!ret && copy_to_user(arg, &ifr,
4837                                                  sizeof(struct ifreq)))
4838                                 ret = -EFAULT;
4839                         return ret;
4840                 }
4841                 /* Take care of Wireless Extensions */
4842                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4843                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4844                 return -EINVAL;
4845         }
4846 }
4847
4848
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Returns a suitable unique value for a new device interface
 *      number.  The caller must hold the rtnl semaphore or the
 *      dev_base_lock to be sure it remains unique.
 *
 *      Candidates are taken from a counter shared by all namespaces;
 *      it restarts at 1 once it stops being positive, and values that
 *      __dev_get_by_index() already finds in @net are skipped.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
4867
4868 /* Delayed registration/unregisteration */
4869 static LIST_HEAD(net_todo_list);
4870
4871 static void net_set_todo(struct net_device *dev)
4872 {
4873         list_add_tail(&dev->todo_list, &net_todo_list);
4874 }
4875
4876 static void rollback_registered_many(struct list_head *head)
4877 {
4878         struct net_device *dev, *tmp;
4879
4880         BUG_ON(dev_boot_phase);
4881         ASSERT_RTNL();
4882
4883         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4884                 /* Some devices call without registering
4885                  * for initialization unwind. Remove those
4886                  * devices and proceed with the remaining.
4887                  */
4888                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4889                         pr_debug("unregister_netdevice: device %s/%p never "
4890                                  "was registered\n", dev->name, dev);
4891
4892                         WARN_ON(1);
4893                         list_del(&dev->unreg_list);
4894                         continue;
4895                 }
4896
4897                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4898
4899                 /* If device is running, close it first. */
4900                 dev_close(dev);
4901
4902                 /* And unlink it from device chain. */
4903                 unlist_netdevice(dev);
4904
4905                 dev->reg_state = NETREG_UNREGISTERING;
4906         }
4907
4908         synchronize_net();
4909
4910         list_for_each_entry(dev, head, unreg_list) {
4911                 /* Shutdown queueing discipline. */
4912                 dev_shutdown(dev);
4913
4914
4915                 /* Notify protocols, that we are about to destroy
4916                    this device. They should clean all the things.
4917                 */
4918                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4919
4920                 if (!dev->rtnl_link_ops ||
4921                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4922                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4923
4924                 /*
4925                  *      Flush the unicast and multicast chains
4926                  */
4927                 dev_unicast_flush(dev);
4928                 dev_addr_discard(dev);
4929
4930                 if (dev->netdev_ops->ndo_uninit)
4931                         dev->netdev_ops->ndo_uninit(dev);
4932
4933                 /* Notifier chain MUST detach us from master device. */
4934                 WARN_ON(dev->master);
4935
4936                 /* Remove entries from kobject tree */
4937                 netdev_unregister_kobject(dev);
4938         }
4939
4940         /* Process any work delayed until the end of the batch */
4941         dev = list_first_entry(head, struct net_device, unreg_list);
4942         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4943
4944         synchronize_net();
4945
4946         list_for_each_entry(dev, head, unreg_list)
4947                 dev_put(dev);
4948 }
4949
4950 static void rollback_registered(struct net_device *dev)
4951 {
4952         LIST_HEAD(single);
4953
4954         list_add(&dev->unreg_list, &single);
4955         rollback_registered_many(&single);
4956 }
4957
4958 static void __netdev_init_queue_locks_one(struct net_device *dev,
4959                                           struct netdev_queue *dev_queue,
4960                                           void *_unused)
4961 {
4962         spin_lock_init(&dev_queue->_xmit_lock);
4963         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4964         dev_queue->xmit_lock_owner = -1;
4965 }
4966
4967 static void netdev_init_queue_locks(struct net_device *dev)
4968 {
4969         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4970         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4971 }
4972
4973 unsigned long netdev_fix_features(unsigned long features, const char *name)
4974 {
4975         /* Fix illegal SG+CSUM combinations. */
4976         if ((features & NETIF_F_SG) &&
4977             !(features & NETIF_F_ALL_CSUM)) {
4978                 if (name)
4979                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4980                                "checksum feature.\n", name);
4981                 features &= ~NETIF_F_SG;
4982         }
4983
4984         /* TSO requires that SG is present as well. */
4985         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4986                 if (name)
4987                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4988                                "SG feature.\n", name);
4989                 features &= ~NETIF_F_TSO;
4990         }
4991
4992         if (features & NETIF_F_UFO) {
4993                 if (!(features & NETIF_F_GEN_CSUM)) {
4994                         if (name)
4995                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4996                                        "since no NETIF_F_HW_CSUM feature.\n",
4997                                        name);
4998                         features &= ~NETIF_F_UFO;
4999                 }
5000
5001                 if (!(features & NETIF_F_SG)) {
5002                         if (name)
5003                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5004                                        "since no NETIF_F_SG feature.\n", name);
5005                         features &= ~NETIF_F_UFO;
5006                 }
5007         }
5008
5009         return features;
5010 }
5011 EXPORT_SYMBOL(netdev_fix_features);
5012
5013 /**
5014  *      netif_stacked_transfer_operstate -      transfer operstate
5015  *      @rootdev: the root or lower level device to transfer state from
5016  *      @dev: the device to transfer operstate to
5017  *
5018  *      Transfer operational state from root to device. This is normally
5019  *      called when a stacking relationship exists between the root
5020  *      device and the device(a leaf device).
5021  */
5022 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5023                                         struct net_device *dev)
5024 {
5025         if (rootdev->operstate == IF_OPER_DORMANT)
5026                 netif_dormant_on(dev);
5027         else
5028                 netif_dormant_off(dev);
5029
5030         if (netif_carrier_ok(rootdev)) {
5031                 if (!netif_carrier_ok(dev))
5032                         netif_carrier_on(dev);
5033         } else {
5034                 if (netif_carrier_ok(dev))
5035                         netif_carrier_off(dev);
5036         }
5037 }
5038 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5039
5040 /**
5041  *      register_netdevice      - register a network device
5042  *      @dev: device to register
5043  *
5044  *      Take a completed network device structure and add it to the kernel
5045  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5046  *      chain. 0 is returned on success. A negative errno code is returned
5047  *      on a failure to set up the device, or if the name is a duplicate.
5048  *
5049  *      Callers must hold the rtnl semaphore. You may want
5050  *      register_netdev() instead of this.
5051  *
5052  *      BUGS:
5053  *      The locking appears insufficient to guarantee two parallel registers
5054  *      will not get the same name.
5055  */
5056
5057 int register_netdevice(struct net_device *dev)
5058 {
5059         int ret;
5060         struct net *net = dev_net(dev);
5061
5062         BUG_ON(dev_boot_phase);
5063         ASSERT_RTNL();
5064
5065         might_sleep();
5066
5067         /* When net_device's are persistent, this will be fatal. */
5068         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5069         BUG_ON(!net);
5070
5071         spin_lock_init(&dev->addr_list_lock);
5072         netdev_set_addr_lockdep_class(dev);
5073         netdev_init_queue_locks(dev);
5074
5075         dev->iflink = -1;
5076
5077         /* Init, if this function is available */
5078         if (dev->netdev_ops->ndo_init) {
5079                 ret = dev->netdev_ops->ndo_init(dev);
5080                 if (ret) {
5081                         if (ret > 0)
5082                                 ret = -EIO;
5083                         goto out;
5084                 }
5085         }
5086
5087         ret = dev_get_valid_name(net, dev->name, dev->name, 0);
5088         if (ret)
5089                 goto err_uninit;
5090
5091         dev->ifindex = dev_new_index(net);
5092         if (dev->iflink == -1)
5093                 dev->iflink = dev->ifindex;
5094
5095         /* Fix illegal checksum combinations */
5096         if ((dev->features & NETIF_F_HW_CSUM) &&
5097             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5098                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5099                        dev->name);
5100                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5101         }
5102
5103         if ((dev->features & NETIF_F_NO_CSUM) &&
5104             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5105                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5106                        dev->name);
5107                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5108         }
5109
5110         dev->features = netdev_fix_features(dev->features, dev->name);
5111
5112         /* Enable software GSO if SG is supported. */
5113         if (dev->features & NETIF_F_SG)
5114                 dev->features |= NETIF_F_GSO;
5115
5116         netdev_initialize_kobject(dev);
5117
5118         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5119         ret = notifier_to_errno(ret);
5120         if (ret)
5121                 goto err_uninit;
5122
5123         ret = netdev_register_kobject(dev);
5124         if (ret)
5125                 goto err_uninit;
5126         dev->reg_state = NETREG_REGISTERED;
5127
5128         /*
5129          *      Default initial state at registry is that the
5130          *      device is present.
5131          */
5132
5133         set_bit(__LINK_STATE_PRESENT, &dev->state);
5134
5135         dev_init_scheduler(dev);
5136         dev_hold(dev);
5137         list_netdevice(dev);
5138
5139         /* Notify protocols, that a new device appeared. */
5140         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5141         ret = notifier_to_errno(ret);
5142         if (ret) {
5143                 rollback_registered(dev);
5144                 dev->reg_state = NETREG_UNREGISTERED;
5145         }
5146         /*
5147          *      Prevent userspace races by waiting until the network
5148          *      device is fully setup before sending notifications.
5149          */
5150         if (!dev->rtnl_link_ops ||
5151             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5152                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5153
5154 out:
5155         return ret;
5156
5157 err_uninit:
5158         if (dev->netdev_ops->ndo_uninit)
5159                 dev->netdev_ops->ndo_uninit(dev);
5160         goto out;
5161 }
5162 EXPORT_SYMBOL(register_netdevice);
5163
5164 /**
5165  *      init_dummy_netdev       - init a dummy network device for NAPI
5166  *      @dev: device to init
5167  *
5168  *      This takes a network device structure and initialize the minimum
5169  *      amount of fields so it can be used to schedule NAPI polls without
5170  *      registering a full blown interface. This is to be used by drivers
5171  *      that need to tie several hardware interfaces to a single NAPI
5172  *      poll scheduler due to HW limitations.
5173  */
5174 int init_dummy_netdev(struct net_device *dev)
5175 {
5176         /* Clear everything. Note we don't initialize spinlocks
5177          * are they aren't supposed to be taken by any of the
5178          * NAPI code and this dummy netdev is supposed to be
5179          * only ever used for NAPI polls
5180          */
5181         memset(dev, 0, sizeof(struct net_device));
5182
5183         /* make sure we BUG if trying to hit standard
5184          * register/unregister code path
5185          */
5186         dev->reg_state = NETREG_DUMMY;
5187
5188         /* initialize the ref count */
5189         atomic_set(&dev->refcnt, 1);
5190
5191         /* NAPI wants this */
5192         INIT_LIST_HEAD(&dev->napi_list);
5193
5194         /* a dummy interface is started by default */
5195         set_bit(__LINK_STATE_PRESENT, &dev->state);
5196         set_bit(__LINK_STATE_START, &dev->state);
5197
5198         return 0;
5199 }
5200 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5201
5202
5203 /**
5204  *      register_netdev - register a network device
5205  *      @dev: device to register
5206  *
5207  *      Take a completed network device structure and add it to the kernel
5208  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5209  *      chain. 0 is returned on success. A negative errno code is returned
5210  *      on a failure to set up the device, or if the name is a duplicate.
5211  *
5212  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5213  *      and expands the device name if you passed a format string to
5214  *      alloc_netdev.
5215  */
5216 int register_netdev(struct net_device *dev)
5217 {
5218         int err;
5219
5220         rtnl_lock();
5221
5222         /*
5223          * If the name is a format string the caller wants us to do a
5224          * name allocation.
5225          */
5226         if (strchr(dev->name, '%')) {
5227                 err = dev_alloc_name(dev, dev->name);
5228                 if (err < 0)
5229                         goto out;
5230         }
5231
5232         err = register_netdevice(dev);
5233 out:
5234         rtnl_unlock();
5235         return err;
5236 }
5237 EXPORT_SYMBOL(register_netdev);
5238
5239 /*
5240  * netdev_wait_allrefs - wait until all references are gone.
5241  *
5242  * This is called when unregistering network devices.
5243  *
5244  * Any protocol or device that holds a reference should register
5245  * for netdevice notification, and cleanup and put back the
5246  * reference if they receive an UNREGISTER event.
5247  * We can get stuck here if buggy protocols don't correctly
5248  * call dev_put.
5249  */
5250 static void netdev_wait_allrefs(struct net_device *dev)
5251 {
5252         unsigned long rebroadcast_time, warning_time;
5253
5254         linkwatch_forget_dev(dev);
5255
5256         rebroadcast_time = warning_time = jiffies;
5257         while (atomic_read(&dev->refcnt) != 0) {
5258                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5259                         rtnl_lock();
5260
5261                         /* Rebroadcast unregister notification */
5262                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5263                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5264                          * should have already handle it the first time */
5265
5266                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5267                                      &dev->state)) {
5268                                 /* We must not have linkwatch events
5269                                  * pending on unregister. If this
5270                                  * happens, we simply run the queue
5271                                  * unscheduled, resulting in a noop
5272                                  * for this device.
5273                                  */
5274                                 linkwatch_run_queue();
5275                         }
5276
5277                         __rtnl_unlock();
5278
5279                         rebroadcast_time = jiffies;
5280                 }
5281
5282                 msleep(250);
5283
5284                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5285                         printk(KERN_EMERG "unregister_netdevice: "
5286                                "waiting for %s to become free. Usage "
5287                                "count = %d\n",
5288                                dev->name, atomic_read(&dev->refcnt));
5289                         warning_time = jiffies;
5290                 }
5291         }
5292 }
5293
5294 /* The sequence is:
5295  *
5296  *      rtnl_lock();
5297  *      ...
5298  *      register_netdevice(x1);
5299  *      register_netdevice(x2);
5300  *      ...
5301  *      unregister_netdevice(y1);
5302  *      unregister_netdevice(y2);
5303  *      ...
5304  *      rtnl_unlock();
5305  *      free_netdev(y1);
5306  *      free_netdev(y2);
5307  *
5308  * We are invoked by rtnl_unlock().
5309  * This allows us to deal with problems:
5310  * 1) We can delete sysfs objects which invoke hotplug
5311  *    without deadlocking with linkwatch via keventd.
5312  * 2) Since we run with the RTNL semaphore not held, we can sleep
5313  *    safely in order to wait for the netdev refcnt to drop to zero.
5314  *
5315  * We must not return until all unregister events added during
5316  * the interval the lock was held have been completed.
5317  */
5318 void netdev_run_todo(void)
5319 {
5320         struct list_head list;
5321
5322         /* Snapshot list, allow later requests */
5323         list_replace_init(&net_todo_list, &list);
5324
5325         __rtnl_unlock();
5326
5327         while (!list_empty(&list)) {
5328                 struct net_device *dev
5329                         = list_first_entry(&list, struct net_device, todo_list);
5330                 list_del(&dev->todo_list);
5331
5332                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5333                         printk(KERN_ERR "network todo '%s' but state %d\n",
5334                                dev->name, dev->reg_state);
5335                         dump_stack();
5336                         continue;
5337                 }
5338
5339                 dev->reg_state = NETREG_UNREGISTERED;
5340
5341                 on_each_cpu(flush_backlog, dev, 1);
5342
5343                 netdev_wait_allrefs(dev);
5344
5345                 /* paranoia */
5346                 BUG_ON(atomic_read(&dev->refcnt));
5347                 WARN_ON(dev->ip_ptr);
5348                 WARN_ON(dev->ip6_ptr);
5349                 WARN_ON(dev->dn_ptr);
5350
5351                 if (dev->destructor)
5352                         dev->destructor(dev);
5353
5354                 /* Free network device */
5355                 kobject_put(&dev->dev.kobj);
5356         }
5357 }
5358
5359 /**
5360  *      dev_txq_stats_fold - fold tx_queues stats
5361  *      @dev: device to get statistics from
5362  *      @stats: struct net_device_stats to hold results
5363  */
5364 void dev_txq_stats_fold(const struct net_device *dev,
5365                         struct net_device_stats *stats)
5366 {
5367         unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5368         unsigned int i;
5369         struct netdev_queue *txq;
5370
5371         for (i = 0; i < dev->num_tx_queues; i++) {
5372                 txq = netdev_get_tx_queue(dev, i);
5373                 tx_bytes   += txq->tx_bytes;
5374                 tx_packets += txq->tx_packets;
5375                 tx_dropped += txq->tx_dropped;
5376         }
5377         if (tx_bytes || tx_packets || tx_dropped) {
5378                 stats->tx_bytes   = tx_bytes;
5379                 stats->tx_packets = tx_packets;
5380                 stats->tx_dropped = tx_dropped;
5381         }
5382 }
5383 EXPORT_SYMBOL(dev_txq_stats_fold);
5384
5385 /**
5386  *      dev_get_stats   - get network device statistics
5387  *      @dev: device to get statistics from
5388  *
5389  *      Get network statistics from device. The device driver may provide
5390  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5391  *      the internal statistics structure is used.
5392  */
5393 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5394 {
5395         const struct net_device_ops *ops = dev->netdev_ops;
5396
5397         if (ops->ndo_get_stats)
5398                 return ops->ndo_get_stats(dev);
5399
5400         dev_txq_stats_fold(dev, &dev->stats);
5401         return &dev->stats;
5402 }
5403 EXPORT_SYMBOL(dev_get_stats);
5404
5405 static void netdev_init_one_queue(struct net_device *dev,
5406                                   struct netdev_queue *queue,
5407                                   void *_unused)
5408 {
5409         queue->dev = dev;
5410 }
5411
5412 static void netdev_init_queues(struct net_device *dev)
5413 {
5414         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5415         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5416         spin_lock_init(&dev->tx_global_lock);
5417 }
5418
5419 /**
5420  *      alloc_netdev_mq - allocate network device
5421  *      @sizeof_priv:   size of private data to allocate space for
5422  *      @name:          device name format string
5423  *      @setup:         callback to initialize device
5424  *      @queue_count:   the number of subqueues to allocate
5425  *
5426  *      Allocates a struct net_device with private data area for driver use
5427  *      and performs basic initialization.  Also allocates subquue structs
5428  *      for each queue on the device at the end of the netdevice.
5429  */
5430 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5431                 void (*setup)(struct net_device *), unsigned int queue_count)
5432 {
5433         struct netdev_queue *tx;
5434         struct net_device *dev;
5435         size_t alloc_size;
5436         struct net_device *p;
5437
5438         BUG_ON(strlen(name) >= sizeof(dev->name));
5439
5440         alloc_size = sizeof(struct net_device);
5441         if (sizeof_priv) {
5442                 /* ensure 32-byte alignment of private area */
5443                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5444                 alloc_size += sizeof_priv;
5445         }
5446         /* ensure 32-byte alignment of whole construct */
5447         alloc_size += NETDEV_ALIGN - 1;
5448
5449         p = kzalloc(alloc_size, GFP_KERNEL);
5450         if (!p) {
5451                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5452                 return NULL;
5453         }
5454
5455         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5456         if (!tx) {
5457                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5458                        "tx qdiscs.\n");
5459                 goto free_p;
5460         }
5461
5462         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5463         dev->padded = (char *)dev - (char *)p;
5464
5465         if (dev_addr_init(dev))
5466                 goto free_tx;
5467
5468         dev_unicast_init(dev);
5469
5470         dev_net_set(dev, &init_net);
5471
5472         dev->_tx = tx;
5473         dev->num_tx_queues = queue_count;
5474         dev->real_num_tx_queues = queue_count;
5475
5476         dev->gso_max_size = GSO_MAX_SIZE;
5477
5478         netdev_init_queues(dev);
5479
5480         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5481         dev->ethtool_ntuple_list.count = 0;
5482         INIT_LIST_HEAD(&dev->napi_list);
5483         INIT_LIST_HEAD(&dev->unreg_list);
5484         INIT_LIST_HEAD(&dev->link_watch_list);
5485         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5486         setup(dev);
5487         strcpy(dev->name, name);
5488         return dev;
5489
5490 free_tx:
5491         kfree(tx);
5492
5493 free_p:
5494         kfree(p);
5495         return NULL;
5496 }
5497 EXPORT_SYMBOL(alloc_netdev_mq);
5498
5499 /**
5500  *      free_netdev - free network device
5501  *      @dev: device
5502  *
5503  *      This function does the last stage of destroying an allocated device
5504  *      interface. The reference to the device object is released.
5505  *      If this is the last reference then it will be freed.
5506  */
5507 void free_netdev(struct net_device *dev)
5508 {
5509         struct napi_struct *p, *n;
5510
5511         release_net(dev_net(dev));
5512
5513         kfree(dev->_tx);
5514
5515         /* Flush device addresses */
5516         dev_addr_flush(dev);
5517
5518         /* Clear ethtool n-tuple list */
5519         ethtool_ntuple_flush(dev);
5520
5521         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5522                 netif_napi_del(p);
5523
5524         /*  Compatibility with error handling in drivers */
5525         if (dev->reg_state == NETREG_UNINITIALIZED) {
5526                 kfree((char *)dev - dev->padded);
5527                 return;
5528         }
5529
5530         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5531         dev->reg_state = NETREG_RELEASED;
5532
5533         /* will free via device release */
5534         put_device(&dev->dev);
5535 }
5536 EXPORT_SYMBOL(free_netdev);
5537
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits, by means of synchronize_rcu(), until the packets that are
 *	being received right now have been dealt with.  Packets arriving
 *	later are not held back.  May sleep.
 */
void synchronize_net(void)
{
	/* Flag callers that are in a context where sleeping is illegal. */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5550
5551 /**
5552  *      unregister_netdevice_queue - remove device from the kernel
5553  *      @dev: device
5554  *      @head: list
5555  *
5556  *      This function shuts down a device interface and removes it
5557  *      from the kernel tables.
5558  *      If head not NULL, device is queued to be unregistered later.
5559  *
5560  *      Callers must hold the rtnl semaphore.  You may want
5561  *      unregister_netdev() instead of this.
5562  */
5563
5564 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5565 {
5566         ASSERT_RTNL();
5567
5568         if (head) {
5569                 list_move_tail(&dev->unreg_list, head);
5570         } else {
5571                 rollback_registered(dev);
5572                 /* Finish processing unregister after unlock */
5573                 net_set_todo(dev);
5574         }
5575 }
5576 EXPORT_SYMBOL(unregister_netdevice_queue);
5577
5578 /**
5579  *      unregister_netdevice_many - unregister many devices
5580  *      @head: list of devices
5581  */
5582 void unregister_netdevice_many(struct list_head *head)
5583 {
5584         struct net_device *dev;
5585
5586         if (!list_empty(head)) {
5587                 rollback_registered_many(head);
5588                 list_for_each_entry(dev, head, unreg_list)
5589                         net_set_todo(dev);
5590         }
5591 }
5592 EXPORT_SYMBOL(unregister_netdevice_many);
5593
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() bracketed by rtnl_lock() and
 *	rtnl_unlock().  In general this is the function to use rather than
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5612
5613 /**
5614  *      dev_change_net_namespace - move device to different nethost namespace
5615  *      @dev: device
5616  *      @net: network namespace
5617  *      @pat: If not NULL name pattern to try if the current device name
5618  *            is already taken in the destination network namespace.
5619  *
5620  *      This function shuts down a device interface and moves it
5621  *      to a new network namespace. On success 0 is returned, on
5622  *      a failure a netagive errno code is returned.
5623  *
5624  *      Callers must hold the rtnl semaphore.
5625  */
5626
5627 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5628 {
5629         int err;
5630
5631         ASSERT_RTNL();
5632
5633         /* Don't allow namespace local devices to be moved. */
5634         err = -EINVAL;
5635         if (dev->features & NETIF_F_NETNS_LOCAL)
5636                 goto out;
5637
5638 #ifdef CONFIG_SYSFS
5639         /* Don't allow real devices to be moved when sysfs
5640          * is enabled.
5641          */
5642         err = -EINVAL;
5643         if (dev->dev.parent)
5644                 goto out;
5645 #endif
5646
5647         /* Ensure the device has been registrered */
5648         err = -EINVAL;
5649         if (dev->reg_state != NETREG_REGISTERED)
5650                 goto out;
5651
5652         /* Get out if there is nothing todo */
5653         err = 0;
5654         if (net_eq(dev_net(dev), net))
5655                 goto out;
5656
5657         /* Pick the destination device name, and ensure
5658          * we can use it in the destination network namespace.
5659          */
5660         err = -EEXIST;
5661         if (__dev_get_by_name(net, dev->name)) {
5662                 /* We get here if we can't use the current device name */
5663                 if (!pat)
5664                         goto out;
5665                 if (dev_get_valid_name(net, pat, dev->name, 1))
5666                         goto out;
5667         }
5668
5669         /*
5670          * And now a mini version of register_netdevice unregister_netdevice.
5671          */
5672
5673         /* If device is running close it first. */
5674         dev_close(dev);
5675
5676         /* And unlink it from device chain */
5677         err = -ENODEV;
5678         unlist_netdevice(dev);
5679
5680         synchronize_net();
5681
5682         /* Shutdown queueing discipline. */
5683         dev_shutdown(dev);
5684
5685         /* Notify protocols, that we are about to destroy
5686            this device. They should clean all the things.
5687         */
5688         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5689         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5690
5691         /*
5692          *      Flush the unicast and multicast chains
5693          */
5694         dev_unicast_flush(dev);
5695         dev_addr_discard(dev);
5696
5697         netdev_unregister_kobject(dev);
5698
5699         /* Actually switch the network namespace */
5700         dev_net_set(dev, net);
5701
5702         /* If there is an ifindex conflict assign a new one */
5703         if (__dev_get_by_index(net, dev->ifindex)) {
5704                 int iflink = (dev->iflink == dev->ifindex);
5705                 dev->ifindex = dev_new_index(net);
5706                 if (iflink)
5707                         dev->iflink = dev->ifindex;
5708         }
5709
5710         /* Fixup kobjects */
5711         err = netdev_register_kobject(dev);
5712         WARN_ON(err);
5713
5714         /* Add the device back in the hashes */
5715         list_netdevice(dev);
5716
5717         /* Notify protocols, that a new device appeared. */
5718         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5719
5720         /*
5721          *      Prevent userspace races by waiting until the network
5722          *      device is fully setup before sending notifications.
5723          */
5724         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5725
5726         synchronize_net();
5727         err = 0;
5728 out:
5729         return err;
5730 }
5731 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5732
5733 static int dev_cpu_callback(struct notifier_block *nfb,
5734                             unsigned long action,
5735                             void *ocpu)
5736 {
5737         struct sk_buff **list_skb;
5738         struct Qdisc **list_net;
5739         struct sk_buff *skb;
5740         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5741         struct softnet_data *sd, *oldsd;
5742
5743         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5744                 return NOTIFY_OK;
5745
5746         local_irq_disable();
5747         cpu = smp_processor_id();
5748         sd = &per_cpu(softnet_data, cpu);
5749         oldsd = &per_cpu(softnet_data, oldcpu);
5750
5751         /* Find end of our completion_queue. */
5752         list_skb = &sd->completion_queue;
5753         while (*list_skb)
5754                 list_skb = &(*list_skb)->next;
5755         /* Append completion queue from offline CPU. */
5756         *list_skb = oldsd->completion_queue;
5757         oldsd->completion_queue = NULL;
5758
5759         /* Find end of our output_queue. */
5760         list_net = &sd->output_queue;
5761         while (*list_net)
5762                 list_net = &(*list_net)->next_sched;
5763         /* Append output queue from offline CPU. */
5764         *list_net = oldsd->output_queue;
5765         oldsd->output_queue = NULL;
5766
5767         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5768         local_irq_enable();
5769
5770         /* Process offline CPU's input_pkt_queue */
5771         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5772                 netif_rx(skb);
5773
5774         return NOTIFY_OK;
5775 }
5776
5777
5778 /**
5779  *      netdev_increment_features - increment feature set by one
5780  *      @all: current feature set
5781  *      @one: new feature set
5782  *      @mask: mask feature set
5783  *
5784  *      Computes a new feature set after adding a device with feature set
5785  *      @one to the master device with current feature set @all.  Will not
5786  *      enable anything that is off in @mask. Returns the new feature set.
5787  */
5788 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5789                                         unsigned long mask)
5790 {
5791         /* If device needs checksumming, downgrade to it. */
5792         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5793                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5794         else if (mask & NETIF_F_ALL_CSUM) {
5795                 /* If one device supports v4/v6 checksumming, set for all. */
5796                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5797                     !(all & NETIF_F_GEN_CSUM)) {
5798                         all &= ~NETIF_F_ALL_CSUM;
5799                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5800                 }
5801
5802                 /* If one device supports hw checksumming, set for all. */
5803                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5804                         all &= ~NETIF_F_ALL_CSUM;
5805                         all |= NETIF_F_HW_CSUM;
5806                 }
5807         }
5808
5809         one |= NETIF_F_ALL_CSUM;
5810
5811         one |= all & NETIF_F_ONE_FOR_ALL;
5812         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5813         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5814
5815         return all;
5816 }
5817 EXPORT_SYMBOL(netdev_increment_features);
5818
5819 static struct hlist_head *netdev_create_hash(void)
5820 {
5821         int i;
5822         struct hlist_head *hash;
5823
5824         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5825         if (hash != NULL)
5826                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5827                         INIT_HLIST_HEAD(&hash[i]);
5828
5829         return hash;
5830 }
5831
5832 /* Initialize per network namespace state */
5833 static int __net_init netdev_init(struct net *net)
5834 {
5835         INIT_LIST_HEAD(&net->dev_base_head);
5836
5837         net->dev_name_head = netdev_create_hash();
5838         if (net->dev_name_head == NULL)
5839                 goto err_name;
5840
5841         net->dev_index_head = netdev_create_hash();
5842         if (net->dev_index_head == NULL)
5843                 goto err_idx;
5844
5845         return 0;
5846
5847 err_idx:
5848         kfree(net->dev_name_head);
5849 err_name:
5850         return -ENOMEM;
5851 }
5852
5853 /**
5854  *      netdev_drivername - network driver for the device
5855  *      @dev: network device
5856  *      @buffer: buffer for resulting name
5857  *      @len: size of buffer
5858  *
5859  *      Determine network driver for device.
5860  */
5861 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5862 {
5863         const struct device_driver *driver;
5864         const struct device *parent;
5865
5866         if (len <= 0 || !buffer)
5867                 return buffer;
5868         buffer[0] = 0;
5869
5870         parent = dev->dev.parent;
5871
5872         if (!parent)
5873                 return buffer;
5874
5875         driver = parent->driver;
5876         if (driver && driver->name)
5877                 strlcpy(buffer, driver->name, len);
5878         return buffer;
5879 }
5880
5881 static void __net_exit netdev_exit(struct net *net)
5882 {
5883         kfree(net->dev_name_head);
5884         kfree(net->dev_index_head);
5885 }
5886
5887 static struct pernet_operations __net_initdata netdev_net_ops = {
5888         .init = netdev_init,
5889         .exit = netdev_exit,
5890 };
5891
5892 static void __net_exit default_device_exit(struct net *net)
5893 {
5894         struct net_device *dev, *aux;
5895         /*
5896          * Push all migratable network devices back to the
5897          * initial network namespace
5898          */
5899         rtnl_lock();
5900         for_each_netdev_safe(net, dev, aux) {
5901                 int err;
5902                 char fb_name[IFNAMSIZ];
5903
5904                 /* Ignore unmoveable devices (i.e. loopback) */
5905                 if (dev->features & NETIF_F_NETNS_LOCAL)
5906                         continue;
5907
5908                 /* Leave virtual devices for the generic cleanup */
5909                 if (dev->rtnl_link_ops)
5910                         continue;
5911
5912                 /* Push remaing network devices to init_net */
5913                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5914                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5915                 if (err) {
5916                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5917                                 __func__, dev->name, err);
5918                         BUG();
5919                 }
5920         }
5921         rtnl_unlock();
5922 }
5923
5924 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5925 {
5926         /* At exit all network devices most be removed from a network
5927          * namespace.  Do this in the reverse order of registeration.
5928          * Do this across as many network namespaces as possible to
5929          * improve batching efficiency.
5930          */
5931         struct net_device *dev;
5932         struct net *net;
5933         LIST_HEAD(dev_kill_list);
5934
5935         rtnl_lock();
5936         list_for_each_entry(net, net_list, exit_list) {
5937                 for_each_netdev_reverse(net, dev) {
5938                         if (dev->rtnl_link_ops)
5939                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5940                         else
5941                                 unregister_netdevice_queue(dev, &dev_kill_list);
5942                 }
5943         }
5944         unregister_netdevice_many(&dev_kill_list);
5945         rtnl_unlock();
5946 }
5947
5948 static struct pernet_operations __net_initdata default_device_ops = {
5949         .exit = default_device_exit,
5950         .exit_batch = default_device_exit_batch,
5951 };
5952
5953 /*
5954  *      Initialize the DEV module. At boot time this walks the device list and
5955  *      unhooks any devices that fail to initialise (normally hardware not
5956  *      present) and leaves us with a valid list of present and active devices.
5957  *
5958  */
5959
5960 /*
5961  *       This is called single threaded during boot, so no need
5962  *       to take the rtnl semaphore.
5963  */
5964 static int __init net_dev_init(void)
5965 {
5966         int i, rc = -ENOMEM;
5967
5968         BUG_ON(!dev_boot_phase);
5969
5970         if (dev_proc_init())
5971                 goto out;
5972
5973         if (netdev_kobject_init())
5974                 goto out;
5975
5976         INIT_LIST_HEAD(&ptype_all);
5977         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5978                 INIT_LIST_HEAD(&ptype_base[i]);
5979
5980         if (register_pernet_subsys(&netdev_net_ops))
5981                 goto out;
5982
5983         /*
5984          *      Initialise the packet receive queues.
5985          */
5986
5987         for_each_possible_cpu(i) {
5988                 struct softnet_data *queue;
5989
5990                 queue = &per_cpu(softnet_data, i);
5991                 skb_queue_head_init(&queue->input_pkt_queue);
5992                 queue->completion_queue = NULL;
5993                 INIT_LIST_HEAD(&queue->poll_list);
5994
5995                 queue->backlog.poll = process_backlog;
5996                 queue->backlog.weight = weight_p;
5997                 queue->backlog.gro_list = NULL;
5998                 queue->backlog.gro_count = 0;
5999         }
6000
6001         dev_boot_phase = 0;
6002
6003         /* The loopback device is special if any other network devices
6004          * is present in a network namespace the loopback device must
6005          * be present. Since we now dynamically allocate and free the
6006          * loopback device ensure this invariant is maintained by
6007          * keeping the loopback device as the first device on the
6008          * list of network devices.  Ensuring the loopback devices
6009          * is the first device that appears and the last network device
6010          * that disappears.
6011          */
6012         if (register_pernet_device(&loopback_net_ops))
6013                 goto out;
6014
6015         if (register_pernet_device(&default_device_ops))
6016                 goto out;
6017
6018         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6019         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6020
6021         hotcpu_notifier(dev_cpu_callback, 0);
6022         dst_init();
6023         dev_mcast_init();
6024         rc = 0;
6025 out:
6026         return rc;
6027 }
6028
6029 subsys_initcall(net_dev_init);
6030
6031 static int __init initialize_hashrnd(void)
6032 {
6033         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
6034         return 0;
6035 }
6036
6037 late_initcall_sync(initialize_hashrnd);
6038