net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
 172 #define PTYPE_HASH_SIZE (16)
 173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 174
 175 static DEFINE_SPINLOCK(ptype_lock);
 176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177 static struct list_head ptype_all __read_mostly;        /* Taps */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
 198 DEFINE_RWLOCK(dev_base_lock);
 199 EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
 257 /*
 258  *      Our notifier list
 259  */
 260
 261 static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263 /*
 264  *      Device drivers call our routines to queue packets here. We empty the
 265  *      queue in the local softnet handler.
 266  */
 267
 268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269 EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
 276 static const unsigned short netdev_lock_type[] =
 277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 290          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 291          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 292          ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 308          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 309          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 310          "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
1097 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1098 {
1099         return call_netdevice_notifiers(event, dev);
1100 }
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
1356 static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550 #ifdef CONFIG_NET_CLS_ACT
1551         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1552                 net_timestamp_set(skb);
1553 #else
1554         net_timestamp_set(skb);
1555 #endif
1556
1557         rcu_read_lock();
1558         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1559                 /* Never send packets back to the socket
1560                  * they originated from - MvS (miquels@drinkel.ow.org)
1561                  */
1562                 if ((ptype->dev == dev || !ptype->dev) &&
1563                     (ptype->af_packet_priv == NULL ||
1564                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1565                         if (pt_prev) {
1566                                 deliver_skb(skb2, pt_prev, skb->dev);
1567                                 pt_prev = ptype;
1568                                 continue;
1569                         }
1570
1571                         skb2 = skb_clone(skb, GFP_ATOMIC);
1572                         if (!skb2)
1573                                 break;
1574
1575                         /* skb->nh should be correctly
1576                            set by sender, so that the second statement is
1577                            just protection against buggy protocols.
1578                          */
1579                         skb_reset_mac_header(skb2);
1580
1581                         if (skb_network_header(skb2) < skb2->data ||
1582                             skb2->network_header > skb2->tail) {
1583                                 if (net_ratelimit())
1584                                         printk(KERN_CRIT "protocol %04x is "
1585                                                "buggy, dev %s\n",
1586                                                ntohs(skb2->protocol),
1587                                                dev->name);
1588                                 skb_reset_network_header(skb2);
1589                         }
1590
1591                         skb2->transport_header = skb2->network_header;
1592                         skb2->pkt_type = PACKET_OUTGOING;
1593                         pt_prev = ptype;
1594                 }
1595         }
1596         if (pt_prev)
1597                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1598         rcu_read_unlock();
1599 }
1600
1601 /*
1602  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1603  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1604  */
1605 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1606 {
1607         int rc;
1608
1609         if (txq < 1 || txq > dev->num_tx_queues)
1610                 return -EINVAL;
1611
1612         if (dev->reg_state == NETREG_REGISTERED) {
1613                 ASSERT_RTNL();
1614
1615                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1616                                                   txq);
1617                 if (rc)
1618                         return rc;
1619
1620                 if (txq < dev->real_num_tx_queues)
1621                         qdisc_reset_all_tx_gt(dev, txq);
1622         }
1623
1624         dev->real_num_tx_queues = txq;
1625         return 0;
1626 }
1627 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1628
1629 #ifdef CONFIG_RPS
1630 /**
1631  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1632  *      @dev: Network device
1633  *      @rxq: Actual number of RX queues
1634  *
1635  *      This must be called either with the rtnl_lock held or before
1636  *      registration of the net device.  Returns 0 on success, or a
1637  *      negative error code.  If called before registration, it always
1638  *      succeeds.
1639  */
1640 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1641 {
1642         int rc;
1643
1644         if (rxq < 1 || rxq > dev->num_rx_queues)
1645                 return -EINVAL;
1646
1647         if (dev->reg_state == NETREG_REGISTERED) {
1648                 ASSERT_RTNL();
1649
1650                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1651                                                   rxq);
1652                 if (rc)
1653                         return rc;
1654         }
1655
1656         dev->real_num_rx_queues = rxq;
1657         return 0;
1658 }
1659 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1660 #endif
1661
1662 static inline void __netif_reschedule(struct Qdisc *q)
1663 {
1664         struct softnet_data *sd;
1665         unsigned long flags;
1666
1667         local_irq_save(flags);
1668         sd = &__get_cpu_var(softnet_data);
1669         q->next_sched = NULL;
1670         *sd->output_queue_tailp = q;
1671         sd->output_queue_tailp = &q->next_sched;
1672         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1673         local_irq_restore(flags);
1674 }
1675
1676 void __netif_schedule(struct Qdisc *q)
1677 {
1678         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1679                 __netif_reschedule(q);
1680 }
1681 EXPORT_SYMBOL(__netif_schedule);
1682
1683 void dev_kfree_skb_irq(struct sk_buff *skb)
1684 {
1685         if (atomic_dec_and_test(&skb->users)) {
1686                 struct softnet_data *sd;
1687                 unsigned long flags;
1688
1689                 local_irq_save(flags);
1690                 sd = &__get_cpu_var(softnet_data);
1691                 skb->next = sd->completion_queue;
1692                 sd->completion_queue = skb;
1693                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1694                 local_irq_restore(flags);
1695         }
1696 }
1697 EXPORT_SYMBOL(dev_kfree_skb_irq);
1698
1699 void dev_kfree_skb_any(struct sk_buff *skb)
1700 {
1701         if (in_irq() || irqs_disabled())
1702                 dev_kfree_skb_irq(skb);
1703         else
1704                 dev_kfree_skb(skb);
1705 }
1706 EXPORT_SYMBOL(dev_kfree_skb_any);
1707
1708
1709 /**
1710  * netif_device_detach - mark device as removed
1711  * @dev: network device
1712  *
1713  * Mark device as removed from system and therefore no longer available.
1714  */
1715 void netif_device_detach(struct net_device *dev)
1716 {
1717         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1718             netif_running(dev)) {
1719                 netif_tx_stop_all_queues(dev);
1720         }
1721 }
1722 EXPORT_SYMBOL(netif_device_detach);
1723
1724 /**
1725  * netif_device_attach - mark device as attached
1726  * @dev: network device
1727  *
1728  * Mark device as attached from system and restart if needed.
1729  */
1730 void netif_device_attach(struct net_device *dev)
1731 {
1732         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1733             netif_running(dev)) {
1734                 netif_tx_wake_all_queues(dev);
1735                 __netdev_watchdog_up(dev);
1736         }
1737 }
1738 EXPORT_SYMBOL(netif_device_attach);
1739
1740 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1741 {
1742         return ((features & NETIF_F_NO_CSUM) ||
1743                 ((features & NETIF_F_V4_CSUM) &&
1744                  protocol == htons(ETH_P_IP)) ||
1745                 ((features & NETIF_F_V6_CSUM) &&
1746                  protocol == htons(ETH_P_IPV6)) ||
1747                 ((features & NETIF_F_FCOE_CRC) &&
1748                  protocol == htons(ETH_P_FCOE)));
1749 }
1750
1751 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1752 {
1753         __be16 protocol = skb->protocol;
1754         int features = dev->features;
1755
1756         if (vlan_tx_tag_present(skb)) {
1757                 features &= dev->vlan_features;
1758         } else if (protocol == htons(ETH_P_8021Q)) {
1759                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1760                 protocol = veh->h_vlan_encapsulated_proto;
1761                 features &= dev->vlan_features;
1762         }
1763
1764         return can_checksum_protocol(features, protocol);
1765 }
1766
1767 /**
1768  * skb_dev_set -- assign a new device to a buffer
1769  * @skb: buffer for the new device
1770  * @dev: network device
1771  *
1772  * If an skb is owned by a device already, we have to reset
1773  * all data private to the namespace a device belongs to
1774  * before assigning it a new device.
1775  */
1776 #ifdef CONFIG_NET_NS
1777 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1778 {
1779         skb_dst_drop(skb);
1780         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1781                 secpath_reset(skb);
1782                 nf_reset(skb);
1783                 skb_init_secmark(skb);
1784                 skb->mark = 0;
1785                 skb->priority = 0;
1786                 skb->nf_trace = 0;
1787                 skb->ipvs_property = 0;
1788 #ifdef CONFIG_NET_SCHED
1789                 skb->tc_index = 0;
1790 #endif
1791         }
1792         skb->dev = dev;
1793 }
1794 EXPORT_SYMBOL(skb_set_dev);
1795 #endif /* CONFIG_NET_NS */
1796
1797 /*
1798  * Invalidate hardware checksum when packet is to be mangled, and
1799  * complete checksum manually on outgoing path.
1800  */
1801 int skb_checksum_help(struct sk_buff *skb)
1802 {
1803         __wsum csum;
1804         int ret = 0, offset;
1805
1806         if (skb->ip_summed == CHECKSUM_COMPLETE)
1807                 goto out_set_summed;
1808
1809         if (unlikely(skb_shinfo(skb)->gso_size)) {
1810                 /* Let GSO fix up the checksum. */
1811                 goto out_set_summed;
1812         }
1813
1814         offset = skb_checksum_start_offset(skb);
1815         BUG_ON(offset >= skb_headlen(skb));
1816         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1817
1818         offset += skb->csum_offset;
1819         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1820
1821         if (skb_cloned(skb) &&
1822             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1823                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1824                 if (ret)
1825                         goto out;
1826         }
1827
1828         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1829 out_set_summed:
1830         skb->ip_summed = CHECKSUM_NONE;
1831 out:
1832         return ret;
1833 }
1834 EXPORT_SYMBOL(skb_checksum_help);
1835
1836 /**
1837  *      skb_gso_segment - Perform segmentation on skb.
1838  *      @skb: buffer to segment
1839  *      @features: features for the output path (see dev->features)
1840  *
1841  *      This function segments the given skb and returns a list of segments.
1842  *
1843  *      It may return NULL if the skb requires no segmentation.  This is
1844  *      only possible when GSO is used for verifying header integrity.
1845  */
1846 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1847 {
1848         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1849         struct packet_type *ptype;
1850         __be16 type = skb->protocol;
1851         int vlan_depth = ETH_HLEN;
1852         int err;
1853
1854         while (type == htons(ETH_P_8021Q)) {
1855                 struct vlan_hdr *vh;
1856
1857                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1858                         return ERR_PTR(-EINVAL);
1859
1860                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1861                 type = vh->h_vlan_encapsulated_proto;
1862                 vlan_depth += VLAN_HLEN;
1863         }
1864
1865         skb_reset_mac_header(skb);
1866         skb->mac_len = skb->network_header - skb->mac_header;
1867         __skb_pull(skb, skb->mac_len);
1868
1869         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1870                 struct net_device *dev = skb->dev;
1871                 struct ethtool_drvinfo info = {};
1872
1873                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1874                         dev->ethtool_ops->get_drvinfo(dev, &info);
1875
1876                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1877                      info.driver, dev ? dev->features : 0L,
1878                      skb->sk ? skb->sk->sk_route_caps : 0L,
1879                      skb->len, skb->data_len, skb->ip_summed);
1880
1881                 if (skb_header_cloned(skb) &&
1882                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1883                         return ERR_PTR(err);
1884         }
1885
1886         rcu_read_lock();
1887         list_for_each_entry_rcu(ptype,
1888                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1889                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1890                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1891                                 err = ptype->gso_send_check(skb);
1892                                 segs = ERR_PTR(err);
1893                                 if (err || skb_gso_ok(skb, features))
1894                                         break;
1895                                 __skb_push(skb, (skb->data -
1896                                                  skb_network_header(skb)));
1897                         }
1898                         segs = ptype->gso_segment(skb, features);
1899                         break;
1900                 }
1901         }
1902         rcu_read_unlock();
1903
1904         __skb_push(skb, skb->data - skb_mac_header(skb));
1905
1906         return segs;
1907 }
1908 EXPORT_SYMBOL(skb_gso_segment);
1909
1910 /* Take action when hardware reception checksum errors are detected. */
1911 #ifdef CONFIG_BUG
1912 void netdev_rx_csum_fault(struct net_device *dev)
1913 {
1914         if (net_ratelimit()) {
1915                 printk(KERN_ERR "%s: hw csum failure.\n",
1916                         dev ? dev->name : "<unknown>");
1917                 dump_stack();
1918         }
1919 }
1920 EXPORT_SYMBOL(netdev_rx_csum_fault);
1921 #endif
1922
1923 /* Actually, we should eliminate this check as soon as we know, that:
1924  * 1. IOMMU is present and allows to map all the memory.
1925  * 2. No high memory really exists on this machine.
1926  */
1927
1928 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1929 {
1930 #ifdef CONFIG_HIGHMEM
1931         int i;
1932         if (!(dev->features & NETIF_F_HIGHDMA)) {
1933                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1934                         if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1935                                 return 1;
1936         }
1937
1938         if (PCI_DMA_BUS_IS_PHYS) {
1939                 struct device *pdev = dev->dev.parent;
1940
1941                 if (!pdev)
1942                         return 0;
1943                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1944                         dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1945                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1946                                 return 1;
1947                 }
1948         }
1949 #endif
1950         return 0;
1951 }
1952
1953 struct dev_gso_cb {
1954         void (*destructor)(struct sk_buff *skb);
1955 };
1956
1957 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1958
1959 static void dev_gso_skb_destructor(struct sk_buff *skb)
1960 {
1961         struct dev_gso_cb *cb;
1962
1963         do {
1964                 struct sk_buff *nskb = skb->next;
1965
1966                 skb->next = nskb->next;
1967                 nskb->next = NULL;
1968                 kfree_skb(nskb);
1969         } while (skb->next);
1970
1971         cb = DEV_GSO_CB(skb);
1972         if (cb->destructor)
1973                 cb->destructor(skb);
1974 }
1975
1976 /**
1977  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1978  *      @skb: buffer to segment
1979  *
1980  *      This function segments the given skb and stores the list of segments
1981  *      in skb->next.
1982  */
1983 static int dev_gso_segment(struct sk_buff *skb)
1984 {
1985         struct net_device *dev = skb->dev;
1986         struct sk_buff *segs;
1987         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1988                                          NETIF_F_SG : 0);
1989
1990         segs = skb_gso_segment(skb, features);
1991
1992         /* Verifying header integrity only. */
1993         if (!segs)
1994                 return 0;
1995
1996         if (IS_ERR(segs))
1997                 return PTR_ERR(segs);
1998
1999         skb->next = segs;
2000         DEV_GSO_CB(skb)->destructor = skb->destructor;
2001         skb->destructor = dev_gso_skb_destructor;
2002
2003         return 0;
2004 }
2005
2006 /*
2007  * Try to orphan skb early, right before transmission by the device.
2008  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2009  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2010  */
2011 static inline void skb_orphan_try(struct sk_buff *skb)
2012 {
2013         struct sock *sk = skb->sk;
2014
2015         if (sk && !skb_shinfo(skb)->tx_flags) {
2016                 /* skb_tx_hash() wont be able to get sk.
2017                  * We copy sk_hash into skb->rxhash
2018                  */
2019                 if (!skb->rxhash)
2020                         skb->rxhash = sk->sk_hash;
2021                 skb_orphan(skb);
2022         }
2023 }
2024
2025 int netif_get_vlan_features(struct sk_buff *skb, struct net_device *dev)
2026 {
2027         __be16 protocol = skb->protocol;
2028
2029         if (protocol == htons(ETH_P_8021Q)) {
2030                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2031                 protocol = veh->h_vlan_encapsulated_proto;
2032         } else if (!skb->vlan_tci)
2033                 return dev->features;
2034
2035         if (protocol != htons(ETH_P_8021Q))
2036                 return dev->features & dev->vlan_features;
2037         else
2038                 return 0;
2039 }
2040 EXPORT_SYMBOL(netif_get_vlan_features);
2041
2042 /*
2043  * Returns true if either:
2044  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2045  *      2. skb is fragmented and the device does not support SG, or if
2046  *         at least one of fragments is in highmem and device does not
2047  *         support DMA from it.
2048  */
2049 static inline int skb_needs_linearize(struct sk_buff *skb,
2050                                       struct net_device *dev)
2051 {
2052         if (skb_is_nonlinear(skb)) {
2053                 int features = dev->features;
2054
2055                 if (vlan_tx_tag_present(skb))
2056                         features &= dev->vlan_features;
2057
2058                 return (skb_has_frag_list(skb) &&
2059                         !(features & NETIF_F_FRAGLIST)) ||
2060                         (skb_shinfo(skb)->nr_frags &&
2061                         (!(features & NETIF_F_SG) ||
2062                         illegal_highdma(dev, skb)));
2063         }
2064
2065         return 0;
2066 }
2067
2068 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2069                         struct netdev_queue *txq)
2070 {
2071         const struct net_device_ops *ops = dev->netdev_ops;
2072         int rc = NETDEV_TX_OK;
2073
2074         if (likely(!skb->next)) {
2075                 /*
2076                  * If device doesnt need skb->dst, release it right now while
2077                  * its hot in this cpu cache
2078                  */
2079                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2080                         skb_dst_drop(skb);
2081
2082                 if (!list_empty(&ptype_all))
2083                         dev_queue_xmit_nit(skb, dev);
2084
2085                 skb_orphan_try(skb);
2086
2087                 if (vlan_tx_tag_present(skb) &&
2088                     !(dev->features & NETIF_F_HW_VLAN_TX)) {
2089                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2090                         if (unlikely(!skb))
2091                                 goto out;
2092
2093                         skb->vlan_tci = 0;
2094                 }
2095
2096                 if (netif_needs_gso(dev, skb)) {
2097                         if (unlikely(dev_gso_segment(skb)))
2098                                 goto out_kfree_skb;
2099                         if (skb->next)
2100                                 goto gso;
2101                 } else {
2102                         if (skb_needs_linearize(skb, dev) &&
2103                             __skb_linearize(skb))
2104                                 goto out_kfree_skb;
2105
2106                         /* If packet is not checksummed and device does not
2107                          * support checksumming for this protocol, complete
2108                          * checksumming here.
2109                          */
2110                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2111                                 skb_set_transport_header(skb,
2112                                         skb_checksum_start_offset(skb));
2113                                 if (!dev_can_checksum(dev, skb) &&
2114                                      skb_checksum_help(skb))
2115                                         goto out_kfree_skb;
2116                         }
2117                 }
2118
2119                 rc = ops->ndo_start_xmit(skb, dev);
2120                 trace_net_dev_xmit(skb, rc);
2121                 if (rc == NETDEV_TX_OK)
2122                         txq_trans_update(txq);
2123                 return rc;
2124         }
2125
2126 gso:
2127         do {
2128                 struct sk_buff *nskb = skb->next;
2129
2130                 skb->next = nskb->next;
2131                 nskb->next = NULL;
2132
2133                 /*
2134                  * If device doesnt need nskb->dst, release it right now while
2135                  * its hot in this cpu cache
2136                  */
2137                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2138                         skb_dst_drop(nskb);
2139
2140                 rc = ops->ndo_start_xmit(nskb, dev);
2141                 trace_net_dev_xmit(nskb, rc);
2142                 if (unlikely(rc != NETDEV_TX_OK)) {
2143                         if (rc & ~NETDEV_TX_MASK)
2144                                 goto out_kfree_gso_skb;
2145                         nskb->next = skb->next;
2146                         skb->next = nskb;
2147                         return rc;
2148                 }
2149                 txq_trans_update(txq);
2150                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2151                         return NETDEV_TX_BUSY;
2152         } while (skb->next);
2153
2154 out_kfree_gso_skb:
2155         if (likely(skb->next == NULL))
2156                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2157 out_kfree_skb:
2158         kfree_skb(skb);
2159 out:
2160         return rc;
2161 }
2162
2163 static u32 hashrnd __read_mostly;
2164
2165 /*
2166  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2167  * to be used as a distribution range.
2168  */
2169 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2170                   unsigned int num_tx_queues)
2171 {
2172         u32 hash;
2173
2174         if (skb_rx_queue_recorded(skb)) {
2175                 hash = skb_get_rx_queue(skb);
2176                 while (unlikely(hash >= num_tx_queues))
2177                         hash -= num_tx_queues;
2178                 return hash;
2179         }
2180
2181         if (skb->sk && skb->sk->sk_hash)
2182                 hash = skb->sk->sk_hash;
2183         else
2184                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2185         hash = jhash_1word(hash, hashrnd);
2186
2187         return (u16) (((u64) hash * num_tx_queues) >> 32);
2188 }
2189 EXPORT_SYMBOL(__skb_tx_hash);
2190
2191 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2192 {
2193         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2194                 if (net_ratelimit()) {
2195                         pr_warning("%s selects TX queue %d, but "
2196                                 "real number of TX queues is %d\n",
2197                                 dev->name, queue_index, dev->real_num_tx_queues);
2198                 }
2199                 return 0;
2200         }
2201         return queue_index;
2202 }
2203
2204 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2205 {
2206 #ifdef CONFIG_XPS
2207         struct xps_dev_maps *dev_maps;
2208         struct xps_map *map;
2209         int queue_index = -1;
2210
2211         rcu_read_lock();
2212         dev_maps = rcu_dereference(dev->xps_maps);
2213         if (dev_maps) {
2214                 map = rcu_dereference(
2215                     dev_maps->cpu_map[raw_smp_processor_id()]);
2216                 if (map) {
2217                         if (map->len == 1)
2218                                 queue_index = map->queues[0];
2219                         else {
2220                                 u32 hash;
2221                                 if (skb->sk && skb->sk->sk_hash)
2222                                         hash = skb->sk->sk_hash;
2223                                 else
2224                                         hash = (__force u16) skb->protocol ^
2225                                             skb->rxhash;
2226                                 hash = jhash_1word(hash, hashrnd);
2227                                 queue_index = map->queues[
2228                                     ((u64)hash * map->len) >> 32];
2229                         }
2230                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2231                                 queue_index = -1;
2232                 }
2233         }
2234         rcu_read_unlock();
2235
2236         return queue_index;
2237 #else
2238         return -1;
2239 #endif
2240 }
2241
2242 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2243                                         struct sk_buff *skb)
2244 {
2245         int queue_index;
2246         const struct net_device_ops *ops = dev->netdev_ops;
2247
2248         if (dev->real_num_tx_queues == 1)
2249                 queue_index = 0;
2250         else if (ops->ndo_select_queue) {
2251                 queue_index = ops->ndo_select_queue(dev, skb);
2252                 queue_index = dev_cap_txqueue(dev, queue_index);
2253         } else {
2254                 struct sock *sk = skb->sk;
2255                 queue_index = sk_tx_queue_get(sk);
2256
2257                 if (queue_index < 0 || skb->ooo_okay ||
2258                     queue_index >= dev->real_num_tx_queues) {
2259                         int old_index = queue_index;
2260
2261                         queue_index = get_xps_queue(dev, skb);
2262                         if (queue_index < 0)
2263                                 queue_index = skb_tx_hash(dev, skb);
2264
2265                         if (queue_index != old_index && sk) {
2266                                 struct dst_entry *dst =
2267                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2268
2269                                 if (dst && skb_dst(skb) == dst)
2270                                         sk_tx_queue_set(sk, queue_index);
2271                         }
2272                 }
2273         }
2274
2275         skb_set_queue_mapping(skb, queue_index);
2276         return netdev_get_tx_queue(dev, queue_index);
2277 }
2278
2279 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2280                                  struct net_device *dev,
2281                                  struct netdev_queue *txq)
2282 {
2283         spinlock_t *root_lock = qdisc_lock(q);
2284         bool contended = qdisc_is_running(q);
2285         int rc;
2286
2287         /*
2288          * Heuristic to force contended enqueues to serialize on a
2289          * separate lock before trying to get qdisc main lock.
2290          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2291          * and dequeue packets faster.
2292          */
2293         if (unlikely(contended))
2294                 spin_lock(&q->busylock);
2295
2296         spin_lock(root_lock);
2297         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2298                 kfree_skb(skb);
2299                 rc = NET_XMIT_DROP;
2300         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2301                    qdisc_run_begin(q)) {
2302                 /*
2303                  * This is a work-conserving queue; there are no old skbs
2304                  * waiting to be sent out; and the qdisc is not running -
2305                  * xmit the skb directly.
2306                  */
2307                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2308                         skb_dst_force(skb);
2309                 __qdisc_update_bstats(q, skb->len);
2310                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2311                         if (unlikely(contended)) {
2312                                 spin_unlock(&q->busylock);
2313                                 contended = false;
2314                         }
2315                         __qdisc_run(q);
2316                 } else
2317                         qdisc_run_end(q);
2318
2319                 rc = NET_XMIT_SUCCESS;
2320         } else {
2321                 skb_dst_force(skb);
2322                 rc = qdisc_enqueue_root(skb, q);
2323                 if (qdisc_run_begin(q)) {
2324                         if (unlikely(contended)) {
2325                                 spin_unlock(&q->busylock);
2326                                 contended = false;
2327                         }
2328                         __qdisc_run(q);
2329                 }
2330         }
2331         spin_unlock(root_lock);
2332         if (unlikely(contended))
2333                 spin_unlock(&q->busylock);
2334         return rc;
2335 }
2336
2337 static DEFINE_PER_CPU(int, xmit_recursion);
2338 #define RECURSION_LIMIT 10
2339
2340 /**
2341  *      dev_queue_xmit - transmit a buffer
2342  *      @skb: buffer to transmit
2343  *
2344  *      Queue a buffer for transmission to a network device. The caller must
2345  *      have set the device and priority and built the buffer before calling
2346  *      this function. The function can be called from an interrupt.
2347  *
2348  *      A negative errno code is returned on a failure. A success does not
2349  *      guarantee the frame will be transmitted as it may be dropped due
2350  *      to congestion or traffic shaping.
2351  *
2352  * -----------------------------------------------------------------------------------
2353  *      I notice this method can also return errors from the queue disciplines,
2354  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2355  *      be positive.
2356  *
2357  *      Regardless of the return value, the skb is consumed, so it is currently
2358  *      difficult to retry a send to this method.  (You can bump the ref count
2359  *      before sending to hold a reference for retry if you are careful.)
2360  *
2361  *      When calling this method, interrupts MUST be enabled.  This is because
2362  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2363  *          --BLG
2364  */
2365 int dev_queue_xmit(struct sk_buff *skb)
2366 {
2367         struct net_device *dev = skb->dev;
2368         struct netdev_queue *txq;
2369         struct Qdisc *q;
2370         int rc = -ENOMEM;
2371
2372         /* Disable soft irqs for various locks below. Also
2373          * stops preemption for RCU.
2374          */
2375         rcu_read_lock_bh();
2376
2377         txq = dev_pick_tx(dev, skb);
2378         q = rcu_dereference_bh(txq->qdisc);
2379
2380 #ifdef CONFIG_NET_CLS_ACT
2381         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2382 #endif
2383         trace_net_dev_queue(skb);
2384         if (q->enqueue) {
2385                 rc = __dev_xmit_skb(skb, q, dev, txq);
2386                 goto out;
2387         }
2388
2389         /* The device has no queue. Common case for software devices:
2390            loopback, all the sorts of tunnels...
2391
2392            Really, it is unlikely that netif_tx_lock protection is necessary
2393            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2394            counters.)
2395            However, it is possible, that they rely on protection
2396            made by us here.
2397
2398            Check this and shot the lock. It is not prone from deadlocks.
2399            Either shot noqueue qdisc, it is even simpler 8)
2400          */
2401         if (dev->flags & IFF_UP) {
2402                 int cpu = smp_processor_id(); /* ok because BHs are off */
2403
2404                 if (txq->xmit_lock_owner != cpu) {
2405
2406                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2407                                 goto recursion_alert;
2408
2409                         HARD_TX_LOCK(dev, txq, cpu);
2410
2411                         if (!netif_tx_queue_stopped(txq)) {
2412                                 __this_cpu_inc(xmit_recursion);
2413                                 rc = dev_hard_start_xmit(skb, dev, txq);
2414                                 __this_cpu_dec(xmit_recursion);
2415                                 if (dev_xmit_complete(rc)) {
2416                                         HARD_TX_UNLOCK(dev, txq);
2417                                         goto out;
2418                                 }
2419                         }
2420                         HARD_TX_UNLOCK(dev, txq);
2421                         if (net_ratelimit())
2422                                 printk(KERN_CRIT "Virtual device %s asks to "
2423                                        "queue packet!\n", dev->name);
2424                 } else {
2425                         /* Recursion is detected! It is possible,
2426                          * unfortunately
2427                          */
2428 recursion_alert:
2429                         if (net_ratelimit())
2430                                 printk(KERN_CRIT "Dead loop on virtual device "
2431                                        "%s, fix it urgently!\n", dev->name);
2432                 }
2433         }
2434
2435         rc = -ENETDOWN;
2436         rcu_read_unlock_bh();
2437
2438         kfree_skb(skb);
2439         return rc;
2440 out:
2441         rcu_read_unlock_bh();
2442         return rc;
2443 }
2444 EXPORT_SYMBOL(dev_queue_xmit);
2445
2446
2447 /*=======================================================================
2448                         Receiver routines
2449   =======================================================================*/
2450
2451 int netdev_max_backlog __read_mostly = 1000;
2452 int netdev_tstamp_prequeue __read_mostly = 1;
2453 int netdev_budget __read_mostly = 300;
2454 int weight_p __read_mostly = 64;            /* old backlog weight */
2455
2456 /* Called with irq disabled */
2457 static inline void ____napi_schedule(struct softnet_data *sd,
2458                                      struct napi_struct *napi)
2459 {
2460         list_add_tail(&napi->poll_list, &sd->poll_list);
2461         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2462 }
2463
2464 /*
2465  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2466  * and src/dst port numbers. Returns a non-zero hash number on success
2467  * and 0 on failure.
2468  */
2469 __u32 __skb_get_rxhash(struct sk_buff *skb)
2470 {
2471         int nhoff, hash = 0, poff;
2472         struct ipv6hdr *ip6;
2473         struct iphdr *ip;
2474         u8 ip_proto;
2475         u32 addr1, addr2, ihl;
2476         union {
2477                 u32 v32;
2478                 u16 v16[2];
2479         } ports;
2480
2481         nhoff = skb_network_offset(skb);
2482
2483         switch (skb->protocol) {
2484         case __constant_htons(ETH_P_IP):
2485                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2486                         goto done;
2487
2488                 ip = (struct iphdr *) (skb->data + nhoff);
2489                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2490                         ip_proto = 0;
2491                 else
2492                         ip_proto = ip->protocol;
2493                 addr1 = (__force u32) ip->saddr;
2494                 addr2 = (__force u32) ip->daddr;
2495                 ihl = ip->ihl;
2496                 break;
2497         case __constant_htons(ETH_P_IPV6):
2498                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2499                         goto done;
2500
2501                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2502                 ip_proto = ip6->nexthdr;
2503                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2504                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2505                 ihl = (40 >> 2);
2506                 break;
2507         default:
2508                 goto done;
2509         }
2510
2511         ports.v32 = 0;
2512         poff = proto_ports_offset(ip_proto);
2513         if (poff >= 0) {
2514                 nhoff += ihl * 4 + poff;
2515                 if (pskb_may_pull(skb, nhoff + 4)) {
2516                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2517                         if (ports.v16[1] < ports.v16[0])
2518                                 swap(ports.v16[0], ports.v16[1]);
2519                 }
2520         }
2521
2522         /* get a consistent hash (same value on both flow directions) */
2523         if (addr2 < addr1)
2524                 swap(addr1, addr2);
2525
2526         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2527         if (!hash)
2528                 hash = 1;
2529
2530 done:
2531         return hash;
2532 }
2533 EXPORT_SYMBOL(__skb_get_rxhash);
2534
2535 #ifdef CONFIG_RPS
2536
2537 /* One global table that all flow-based protocols share. */
2538 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2539 EXPORT_SYMBOL(rps_sock_flow_table);
2540
2541 /*
2542  * get_rps_cpu is called from netif_receive_skb and returns the target
2543  * CPU from the RPS map of the receiving queue for a given skb.
2544  * rcu_read_lock must be held on entry.
2545  */
2546 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2547                        struct rps_dev_flow **rflowp)
2548 {
2549         struct netdev_rx_queue *rxqueue;
2550         struct rps_map *map;
2551         struct rps_dev_flow_table *flow_table;
2552         struct rps_sock_flow_table *sock_flow_table;
2553         int cpu = -1;
2554         u16 tcpu;
2555
2556         if (skb_rx_queue_recorded(skb)) {
2557                 u16 index = skb_get_rx_queue(skb);
2558                 if (unlikely(index >= dev->real_num_rx_queues)) {
2559                         WARN_ONCE(dev->real_num_rx_queues > 1,
2560                                   "%s received packet on queue %u, but number "
2561                                   "of RX queues is %u\n",
2562                                   dev->name, index, dev->real_num_rx_queues);
2563                         goto done;
2564                 }
2565                 rxqueue = dev->_rx + index;
2566         } else
2567                 rxqueue = dev->_rx;
2568
2569         map = rcu_dereference(rxqueue->rps_map);
2570         if (map) {
2571                 if (map->len == 1) {
2572                         tcpu = map->cpus[0];
2573                         if (cpu_online(tcpu))
2574                                 cpu = tcpu;
2575                         goto done;
2576                 }
2577         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2578                 goto done;
2579         }
2580
2581         skb_reset_network_header(skb);
2582         if (!skb_get_rxhash(skb))
2583                 goto done;
2584
2585         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2586         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2587         if (flow_table && sock_flow_table) {
2588                 u16 next_cpu;
2589                 struct rps_dev_flow *rflow;
2590
2591                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2592                 tcpu = rflow->cpu;
2593
2594                 next_cpu = sock_flow_table->ents[skb->rxhash &
2595                     sock_flow_table->mask];
2596
2597                 /*
2598                  * If the desired CPU (where last recvmsg was done) is
2599                  * different from current CPU (one in the rx-queue flow
2600                  * table entry), switch if one of the following holds:
2601                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2602                  *   - Current CPU is offline.
2603                  *   - The current CPU's queue tail has advanced beyond the
2604                  *     last packet that was enqueued using this table entry.
2605                  *     This guarantees that all previous packets for the flow
2606                  *     have been dequeued, thus preserving in order delivery.
2607                  */
2608                 if (unlikely(tcpu != next_cpu) &&
2609                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2610                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2611                       rflow->last_qtail)) >= 0)) {
2612                         tcpu = rflow->cpu = next_cpu;
2613                         if (tcpu != RPS_NO_CPU)
2614                                 rflow->last_qtail = per_cpu(softnet_data,
2615                                     tcpu).input_queue_head;
2616                 }
2617                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2618                         *rflowp = rflow;
2619                         cpu = tcpu;
2620                         goto done;
2621                 }
2622         }
2623
2624         if (map) {
2625                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2626
2627                 if (cpu_online(tcpu)) {
2628                         cpu = tcpu;
2629                         goto done;
2630                 }
2631         }
2632
2633 done:
2634         return cpu;
2635 }
2636
2637 /* Called from hardirq (IPI) context */
2638 static void rps_trigger_softirq(void *data)
2639 {
2640         struct softnet_data *sd = data;
2641
2642         ____napi_schedule(sd, &sd->backlog);
2643         sd->received_rps++;
2644 }
2645
2646 #endif /* CONFIG_RPS */
2647
2648 /*
2649  * Check if this softnet_data structure is another cpu one
2650  * If yes, queue it to our IPI list and return 1
2651  * If no, return 0
2652  */
2653 static int rps_ipi_queued(struct softnet_data *sd)
2654 {
2655 #ifdef CONFIG_RPS
2656         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2657
2658         if (sd != mysd) {
2659                 sd->rps_ipi_next = mysd->rps_ipi_list;
2660                 mysd->rps_ipi_list = sd;
2661
2662                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2663                 return 1;
2664         }
2665 #endif /* CONFIG_RPS */
2666         return 0;
2667 }
2668
2669 /*
2670  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2671  * queue (may be a remote CPU queue).
2672  */
2673 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2674                               unsigned int *qtail)
2675 {
2676         struct softnet_data *sd;
2677         unsigned long flags;
2678
2679         sd = &per_cpu(softnet_data, cpu);
2680
2681         local_irq_save(flags);
2682
2683         rps_lock(sd);
2684         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2685                 if (skb_queue_len(&sd->input_pkt_queue)) {
2686 enqueue:
2687                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2688                         input_queue_tail_incr_save(sd, qtail);
2689                         rps_unlock(sd);
2690                         local_irq_restore(flags);
2691                         return NET_RX_SUCCESS;
2692                 }
2693
2694                 /* Schedule NAPI for backlog device
2695                  * We can use non atomic operation since we own the queue lock
2696                  */
2697                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2698                         if (!rps_ipi_queued(sd))
2699                                 ____napi_schedule(sd, &sd->backlog);
2700                 }
2701                 goto enqueue;
2702         }
2703
2704         sd->dropped++;
2705         rps_unlock(sd);
2706
2707         local_irq_restore(flags);
2708
2709         atomic_long_inc(&skb->dev->rx_dropped);
2710         kfree_skb(skb);
2711         return NET_RX_DROP;
2712 }
2713
2714 /**
2715  *      netif_rx        -       post buffer to the network code
2716  *      @skb: buffer to post
2717  *
2718  *      This function receives a packet from a device driver and queues it for
2719  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2720  *      may be dropped during processing for congestion control or by the
2721  *      protocol layers.
2722  *
2723  *      return values:
2724  *      NET_RX_SUCCESS  (no congestion)
2725  *      NET_RX_DROP     (packet was dropped)
2726  *
2727  */
2728
2729 int netif_rx(struct sk_buff *skb)
2730 {
2731         int ret;
2732
2733         /* if netpoll wants it, pretend we never saw it */
2734         if (netpoll_rx(skb))
2735                 return NET_RX_DROP;
2736
2737         if (netdev_tstamp_prequeue)
2738                 net_timestamp_check(skb);
2739
2740         trace_netif_rx(skb);
2741 #ifdef CONFIG_RPS
2742         {
2743                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2744                 int cpu;
2745
2746                 preempt_disable();
2747                 rcu_read_lock();
2748
2749                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2750                 if (cpu < 0)
2751                         cpu = smp_processor_id();
2752
2753                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2754
2755                 rcu_read_unlock();
2756                 preempt_enable();
2757         }
2758 #else
2759         {
2760                 unsigned int qtail;
2761                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2762                 put_cpu();
2763         }
2764 #endif
2765         return ret;
2766 }
2767 EXPORT_SYMBOL(netif_rx);
2768
2769 int netif_rx_ni(struct sk_buff *skb)
2770 {
2771         int err;
2772
2773         preempt_disable();
2774         err = netif_rx(skb);
2775         if (local_softirq_pending())
2776                 do_softirq();
2777         preempt_enable();
2778
2779         return err;
2780 }
2781 EXPORT_SYMBOL(netif_rx_ni);
2782
2783 static void net_tx_action(struct softirq_action *h)
2784 {
2785         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2786
2787         if (sd->completion_queue) {
2788                 struct sk_buff *clist;
2789
2790                 local_irq_disable();
2791                 clist = sd->completion_queue;
2792                 sd->completion_queue = NULL;
2793                 local_irq_enable();
2794
2795                 while (clist) {
2796                         struct sk_buff *skb = clist;
2797                         clist = clist->next;
2798
2799                         WARN_ON(atomic_read(&skb->users));
2800                         trace_kfree_skb(skb, net_tx_action);
2801                         __kfree_skb(skb);
2802                 }
2803         }
2804
2805         if (sd->output_queue) {
2806                 struct Qdisc *head;
2807
2808                 local_irq_disable();
2809                 head = sd->output_queue;
2810                 sd->output_queue = NULL;
2811                 sd->output_queue_tailp = &sd->output_queue;
2812                 local_irq_enable();
2813
2814                 while (head) {
2815                         struct Qdisc *q = head;
2816                         spinlock_t *root_lock;
2817
2818                         head = head->next_sched;
2819
2820                         root_lock = qdisc_lock(q);
2821                         if (spin_trylock(root_lock)) {
2822                                 smp_mb__before_clear_bit();
2823                                 clear_bit(__QDISC_STATE_SCHED,
2824                                           &q->state);
2825                                 qdisc_run(q);
2826                                 spin_unlock(root_lock);
2827                         } else {
2828                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2829                                               &q->state)) {
2830                                         __netif_reschedule(q);
2831                                 } else {
2832                                         smp_mb__before_clear_bit();
2833                                         clear_bit(__QDISC_STATE_SCHED,
2834                                                   &q->state);
2835                                 }
2836                         }
2837                 }
2838         }
2839 }
2840
2841 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2842     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2843 /* This hook is defined here for ATM LANE */
2844 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2845                              unsigned char *addr) __read_mostly;
2846 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2847 #endif
2848
2849 #ifdef CONFIG_NET_CLS_ACT
2850 /* TODO: Maybe we should just force sch_ingress to be compiled in
2851  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2852  * a compare and 2 stores extra right now if we dont have it on
2853  * but have CONFIG_NET_CLS_ACT
2854  * NOTE: This doesnt stop any functionality; if you dont have
2855  * the ingress scheduler, you just cant add policies on ingress.
2856  *
2857  */
2858 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2859 {
2860         struct net_device *dev = skb->dev;
2861         u32 ttl = G_TC_RTTL(skb->tc_verd);
2862         int result = TC_ACT_OK;
2863         struct Qdisc *q;
2864
2865         if (unlikely(MAX_RED_LOOP < ttl++)) {
2866                 if (net_ratelimit())
2867                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2868                                skb->skb_iif, dev->ifindex);
2869                 return TC_ACT_SHOT;
2870         }
2871
2872         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2873         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2874
2875         q = rxq->qdisc;
2876         if (q != &noop_qdisc) {
2877                 spin_lock(qdisc_lock(q));
2878                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2879                         result = qdisc_enqueue_root(skb, q);
2880                 spin_unlock(qdisc_lock(q));
2881         }
2882
2883         return result;
2884 }
2885
2886 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2887                                          struct packet_type **pt_prev,
2888                                          int *ret, struct net_device *orig_dev)
2889 {
2890         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2891
2892         if (!rxq || rxq->qdisc == &noop_qdisc)
2893                 goto out;
2894
2895         if (*pt_prev) {
2896                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2897                 *pt_prev = NULL;
2898         }
2899
2900         switch (ing_filter(skb, rxq)) {
2901         case TC_ACT_SHOT:
2902         case TC_ACT_STOLEN:
2903                 kfree_skb(skb);
2904                 return NULL;
2905         }
2906
2907 out:
2908         skb->tc_verd = 0;
2909         return skb;
2910 }
2911 #endif
2912
2913 /**
2914  *      netdev_rx_handler_register - register receive handler
2915  *      @dev: device to register a handler for
2916  *      @rx_handler: receive handler to register
2917  *      @rx_handler_data: data pointer that is used by rx handler
2918  *
2919  *      Register a receive hander for a device. This handler will then be
2920  *      called from __netif_receive_skb. A negative errno code is returned
2921  *      on a failure.
2922  *
2923  *      The caller must hold the rtnl_mutex.
2924  */
2925 int netdev_rx_handler_register(struct net_device *dev,
2926                                rx_handler_func_t *rx_handler,
2927                                void *rx_handler_data)
2928 {
2929         ASSERT_RTNL();
2930
2931         if (dev->rx_handler)
2932                 return -EBUSY;
2933
2934         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2935         rcu_assign_pointer(dev->rx_handler, rx_handler);
2936
2937         return 0;
2938 }
2939 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2940
2941 /**
2942  *      netdev_rx_handler_unregister - unregister receive handler
2943  *      @dev: device to unregister a handler from
2944  *
2945  *      Unregister a receive hander from a device.
2946  *
2947  *      The caller must hold the rtnl_mutex.
2948  */
2949 void netdev_rx_handler_unregister(struct net_device *dev)
2950 {
2951
2952         ASSERT_RTNL();
2953         rcu_assign_pointer(dev->rx_handler, NULL);
2954         rcu_assign_pointer(dev->rx_handler_data, NULL);
2955 }
2956 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2957
2958 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2959                                               struct net_device *master)
2960 {
2961         if (skb->pkt_type == PACKET_HOST) {
2962                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2963
2964                 memcpy(dest, master->dev_addr, ETH_ALEN);
2965         }
2966 }
2967
2968 /* On bonding slaves other than the currently active slave, suppress
2969  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2970  * ARP on active-backup slaves with arp_validate enabled.
2971  */
2972 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2973 {
2974         struct net_device *dev = skb->dev;
2975
2976         if (master->priv_flags & IFF_MASTER_ARPMON)
2977                 dev->last_rx = jiffies;
2978
2979         if ((master->priv_flags & IFF_MASTER_ALB) &&
2980             (master->priv_flags & IFF_BRIDGE_PORT)) {
2981                 /* Do address unmangle. The local destination address
2982                  * will be always the one master has. Provides the right
2983                  * functionality in a bridge.
2984                  */
2985                 skb_bond_set_mac_by_master(skb, master);
2986         }
2987
2988         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2989                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2990                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
2991                         return 0;
2992
2993                 if (master->priv_flags & IFF_MASTER_ALB) {
2994                         if (skb->pkt_type != PACKET_BROADCAST &&
2995                             skb->pkt_type != PACKET_MULTICAST)
2996                                 return 0;
2997                 }
2998                 if (master->priv_flags & IFF_MASTER_8023AD &&
2999                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3000                         return 0;
3001
3002                 return 1;
3003         }
3004         return 0;
3005 }
3006 EXPORT_SYMBOL(__skb_bond_should_drop);
3007
3008 static int __netif_receive_skb(struct sk_buff *skb)
3009 {
3010         struct packet_type *ptype, *pt_prev;
3011         rx_handler_func_t *rx_handler;
3012         struct net_device *orig_dev;
3013         struct net_device *master;
3014         struct net_device *null_or_orig;
3015         struct net_device *orig_or_bond;
3016         int ret = NET_RX_DROP;
3017         __be16 type;
3018
3019         if (!netdev_tstamp_prequeue)
3020                 net_timestamp_check(skb);
3021
3022         trace_netif_receive_skb(skb);
3023
3024         /* if we've gotten here through NAPI, check netpoll */
3025         if (netpoll_receive_skb(skb))
3026                 return NET_RX_DROP;
3027
3028         if (!skb->skb_iif)
3029                 skb->skb_iif = skb->dev->ifindex;
3030
3031         /*
3032          * bonding note: skbs received on inactive slaves should only
3033          * be delivered to pkt handlers that are exact matches.  Also
3034          * the deliver_no_wcard flag will be set.  If packet handlers
3035          * are sensitive to duplicate packets these skbs will need to
3036          * be dropped at the handler.
3037          */
3038         null_or_orig = NULL;
3039         orig_dev = skb->dev;
3040         master = ACCESS_ONCE(orig_dev->master);
3041         if (skb->deliver_no_wcard)
3042                 null_or_orig = orig_dev;
3043         else if (master) {
3044                 if (skb_bond_should_drop(skb, master)) {
3045                         skb->deliver_no_wcard = 1;
3046                         null_or_orig = orig_dev; /* deliver only exact match */
3047                 } else
3048                         skb->dev = master;
3049         }
3050
3051         __this_cpu_inc(softnet_data.processed);
3052         skb_reset_network_header(skb);
3053         skb_reset_transport_header(skb);
3054         skb->mac_len = skb->network_header - skb->mac_header;
3055
3056         pt_prev = NULL;
3057
3058         rcu_read_lock();
3059
3060 #ifdef CONFIG_NET_CLS_ACT
3061         if (skb->tc_verd & TC_NCLS) {
3062                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3063                 goto ncls;
3064         }
3065 #endif
3066
3067         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3068                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3069                     ptype->dev == orig_dev) {
3070                         if (pt_prev)
3071                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3072                         pt_prev = ptype;
3073                 }
3074         }
3075
3076 #ifdef CONFIG_NET_CLS_ACT
3077         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3078         if (!skb)
3079                 goto out;
3080 ncls:
3081 #endif
3082
3083         /* Handle special case of bridge or macvlan */
3084         rx_handler = rcu_dereference(skb->dev->rx_handler);
3085         if (rx_handler) {
3086                 if (pt_prev) {
3087                         ret = deliver_skb(skb, pt_prev, orig_dev);
3088                         pt_prev = NULL;
3089                 }
3090                 skb = rx_handler(skb);
3091                 if (!skb)
3092                         goto out;
3093         }
3094
3095         if (vlan_tx_tag_present(skb)) {
3096                 if (pt_prev) {
3097                         ret = deliver_skb(skb, pt_prev, orig_dev);
3098                         pt_prev = NULL;
3099                 }
3100                 if (vlan_hwaccel_do_receive(&skb)) {
3101                         ret = __netif_receive_skb(skb);
3102                         goto out;
3103                 } else if (unlikely(!skb))
3104                         goto out;
3105         }
3106
3107         /*
3108          * Make sure frames received on VLAN interfaces stacked on
3109          * bonding interfaces still make their way to any base bonding
3110          * device that may have registered for a specific ptype.  The
3111          * handler may have to adjust skb->dev and orig_dev.
3112          */
3113         orig_or_bond = orig_dev;
3114         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3115             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3116                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3117         }
3118
3119         type = skb->protocol;
3120         list_for_each_entry_rcu(ptype,
3121                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3122                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3123                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3124                      ptype->dev == orig_or_bond)) {
3125                         if (pt_prev)
3126                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3127                         pt_prev = ptype;
3128                 }
3129         }
3130
3131         if (pt_prev) {
3132                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3133         } else {
3134                 atomic_long_inc(&skb->dev->rx_dropped);
3135                 kfree_skb(skb);
3136                 /* Jamal, now you will not able to escape explaining
3137                  * me how you were going to use this. :-)
3138                  */
3139                 ret = NET_RX_DROP;
3140         }
3141
3142 out:
3143         rcu_read_unlock();
3144         return ret;
3145 }
3146
3147 /**
3148  *      netif_receive_skb - process receive buffer from network
3149  *      @skb: buffer to process
3150  *
3151  *      netif_receive_skb() is the main receive data processing function.
3152  *      It always succeeds. The buffer may be dropped during processing
3153  *      for congestion control or by the protocol layers.
3154  *
3155  *      This function may only be called from softirq context and interrupts
3156  *      should be enabled.
3157  *
3158  *      Return values (usually ignored):
3159  *      NET_RX_SUCCESS: no congestion
3160  *      NET_RX_DROP: packet was dropped
3161  */
3162 int netif_receive_skb(struct sk_buff *skb)
3163 {
3164         if (netdev_tstamp_prequeue)
3165                 net_timestamp_check(skb);
3166
3167         if (skb_defer_rx_timestamp(skb))
3168                 return NET_RX_SUCCESS;
3169
3170 #ifdef CONFIG_RPS
3171         {
3172                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3173                 int cpu, ret;
3174
3175                 rcu_read_lock();
3176
3177                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3178
3179                 if (cpu >= 0) {
3180                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3181                         rcu_read_unlock();
3182                 } else {
3183                         rcu_read_unlock();
3184                         ret = __netif_receive_skb(skb);
3185                 }
3186
3187                 return ret;
3188         }
3189 #else
3190         return __netif_receive_skb(skb);
3191 #endif
3192 }
3193 EXPORT_SYMBOL(netif_receive_skb);
3194
3195 /* Network device is going away, flush any packets still pending
3196  * Called with irqs disabled.
3197  */
3198 static void flush_backlog(void *arg)
3199 {
3200         struct net_device *dev = arg;
3201         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3202         struct sk_buff *skb, *tmp;
3203
3204         rps_lock(sd);
3205         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3206                 if (skb->dev == dev) {
3207                         __skb_unlink(skb, &sd->input_pkt_queue);
3208                         kfree_skb(skb);
3209                         input_queue_head_incr(sd);
3210                 }
3211         }
3212         rps_unlock(sd);
3213
3214         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3215                 if (skb->dev == dev) {
3216                         __skb_unlink(skb, &sd->process_queue);
3217                         kfree_skb(skb);
3218                         input_queue_head_incr(sd);
3219                 }
3220         }
3221 }
3222
3223 static int napi_gro_complete(struct sk_buff *skb)
3224 {
3225         struct packet_type *ptype;
3226         __be16 type = skb->protocol;
3227         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3228         int err = -ENOENT;
3229
3230         if (NAPI_GRO_CB(skb)->count == 1) {
3231                 skb_shinfo(skb)->gso_size = 0;
3232                 goto out;
3233         }
3234
3235         rcu_read_lock();
3236         list_for_each_entry_rcu(ptype, head, list) {
3237                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3238                         continue;
3239
3240                 err = ptype->gro_complete(skb);
3241                 break;
3242         }
3243         rcu_read_unlock();
3244
3245         if (err) {
3246                 WARN_ON(&ptype->list == head);
3247                 kfree_skb(skb);
3248                 return NET_RX_SUCCESS;
3249         }
3250
3251 out:
3252         return netif_receive_skb(skb);
3253 }
3254
3255 inline void napi_gro_flush(struct napi_struct *napi)
3256 {
3257         struct sk_buff *skb, *next;
3258
3259         for (skb = napi->gro_list; skb; skb = next) {
3260                 next = skb->next;
3261                 skb->next = NULL;
3262                 napi_gro_complete(skb);
3263         }
3264
3265         napi->gro_count = 0;
3266         napi->gro_list = NULL;
3267 }
3268 EXPORT_SYMBOL(napi_gro_flush);
3269
3270 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3271 {
3272         struct sk_buff **pp = NULL;
3273         struct packet_type *ptype;
3274         __be16 type = skb->protocol;
3275         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3276         int same_flow;
3277         int mac_len;
3278         enum gro_result ret;
3279
3280         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3281                 goto normal;
3282
3283         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3284                 goto normal;
3285
3286         rcu_read_lock();
3287         list_for_each_entry_rcu(ptype, head, list) {
3288                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3289                         continue;
3290
3291                 skb_set_network_header(skb, skb_gro_offset(skb));
3292                 mac_len = skb->network_header - skb->mac_header;
3293                 skb->mac_len = mac_len;
3294                 NAPI_GRO_CB(skb)->same_flow = 0;
3295                 NAPI_GRO_CB(skb)->flush = 0;
3296                 NAPI_GRO_CB(skb)->free = 0;
3297
3298                 pp = ptype->gro_receive(&napi->gro_list, skb);
3299                 break;
3300         }
3301         rcu_read_unlock();
3302
3303         if (&ptype->list == head)
3304                 goto normal;
3305
3306         same_flow = NAPI_GRO_CB(skb)->same_flow;
3307         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3308
3309         if (pp) {
3310                 struct sk_buff *nskb = *pp;
3311
3312                 *pp = nskb->next;
3313                 nskb->next = NULL;
3314                 napi_gro_complete(nskb);
3315                 napi->gro_count--;
3316         }
3317
3318         if (same_flow)
3319                 goto ok;
3320
3321         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3322                 goto normal;
3323
3324         napi->gro_count++;
3325         NAPI_GRO_CB(skb)->count = 1;
3326         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3327         skb->next = napi->gro_list;
3328         napi->gro_list = skb;
3329         ret = GRO_HELD;
3330
3331 pull:
3332         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3333                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3334
3335                 BUG_ON(skb->end - skb->tail < grow);
3336
3337                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3338
3339                 skb->tail += grow;
3340                 skb->data_len -= grow;
3341
3342                 skb_shinfo(skb)->frags[0].page_offset += grow;
3343                 skb_shinfo(skb)->frags[0].size -= grow;
3344
3345                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3346                         put_page(skb_shinfo(skb)->frags[0].page);
3347                         memmove(skb_shinfo(skb)->frags,
3348                                 skb_shinfo(skb)->frags + 1,
3349                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3350                 }
3351         }
3352
3353 ok:
3354         return ret;
3355
3356 normal:
3357         ret = GRO_NORMAL;
3358         goto pull;
3359 }
3360 EXPORT_SYMBOL(dev_gro_receive);
3361
3362 static inline gro_result_t
3363 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3364 {
3365         struct sk_buff *p;
3366
3367         for (p = napi->gro_list; p; p = p->next) {
3368                 unsigned long diffs;
3369
3370                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3371                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3372                 diffs |= compare_ether_header(skb_mac_header(p),
3373                                               skb_gro_mac_header(skb));
3374                 NAPI_GRO_CB(p)->same_flow = !diffs;
3375                 NAPI_GRO_CB(p)->flush = 0;
3376         }
3377
3378         return dev_gro_receive(napi, skb);
3379 }
3380
3381 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3382 {
3383         switch (ret) {
3384         case GRO_NORMAL:
3385                 if (netif_receive_skb(skb))
3386                         ret = GRO_DROP;
3387                 break;
3388
3389         case GRO_DROP:
3390         case GRO_MERGED_FREE:
3391                 kfree_skb(skb);
3392                 break;
3393
3394         case GRO_HELD:
3395         case GRO_MERGED:
3396                 break;
3397         }
3398
3399         return ret;
3400 }
3401 EXPORT_SYMBOL(napi_skb_finish);
3402
3403 void skb_gro_reset_offset(struct sk_buff *skb)
3404 {
3405         NAPI_GRO_CB(skb)->data_offset = 0;
3406         NAPI_GRO_CB(skb)->frag0 = NULL;
3407         NAPI_GRO_CB(skb)->frag0_len = 0;
3408
3409         if (skb->mac_header == skb->tail &&
3410             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3411                 NAPI_GRO_CB(skb)->frag0 =
3412                         page_address(skb_shinfo(skb)->frags[0].page) +
3413                         skb_shinfo(skb)->frags[0].page_offset;
3414                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3415         }
3416 }
3417 EXPORT_SYMBOL(skb_gro_reset_offset);
3418
3419 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3420 {
3421         skb_gro_reset_offset(skb);
3422
3423         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3424 }
3425 EXPORT_SYMBOL(napi_gro_receive);
3426
3427 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3428 {
3429         __skb_pull(skb, skb_headlen(skb));
3430         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3431         skb->vlan_tci = 0;
3432
3433         napi->skb = skb;
3434 }
3435
3436 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3437 {
3438         struct sk_buff *skb = napi->skb;
3439
3440         if (!skb) {
3441                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3442                 if (skb)
3443                         napi->skb = skb;
3444         }
3445         return skb;
3446 }
3447 EXPORT_SYMBOL(napi_get_frags);
3448
3449 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3450                                gro_result_t ret)
3451 {
3452         switch (ret) {
3453         case GRO_NORMAL:
3454         case GRO_HELD:
3455                 skb->protocol = eth_type_trans(skb, skb->dev);
3456
3457                 if (ret == GRO_HELD)
3458                         skb_gro_pull(skb, -ETH_HLEN);
3459                 else if (netif_receive_skb(skb))
3460                         ret = GRO_DROP;
3461                 break;
3462
3463         case GRO_DROP:
3464         case GRO_MERGED_FREE:
3465                 napi_reuse_skb(napi, skb);
3466                 break;
3467
3468         case GRO_MERGED:
3469                 break;
3470         }
3471
3472         return ret;
3473 }
3474 EXPORT_SYMBOL(napi_frags_finish);
3475
3476 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3477 {
3478         struct sk_buff *skb = napi->skb;
3479         struct ethhdr *eth;
3480         unsigned int hlen;
3481         unsigned int off;
3482
3483         napi->skb = NULL;
3484
3485         skb_reset_mac_header(skb);
3486         skb_gro_reset_offset(skb);
3487
3488         off = skb_gro_offset(skb);
3489         hlen = off + sizeof(*eth);
3490         eth = skb_gro_header_fast(skb, off);
3491         if (skb_gro_header_hard(skb, hlen)) {
3492                 eth = skb_gro_header_slow(skb, hlen, off);
3493                 if (unlikely(!eth)) {
3494                         napi_reuse_skb(napi, skb);
3495                         skb = NULL;
3496                         goto out;
3497                 }
3498         }
3499
3500         skb_gro_pull(skb, sizeof(*eth));
3501
3502         /*
3503          * This works because the only protocols we care about don't require
3504          * special handling.  We'll fix it up properly at the end.
3505          */
3506         skb->protocol = eth->h_proto;
3507
3508 out:
3509         return skb;
3510 }
3511 EXPORT_SYMBOL(napi_frags_skb);
3512
3513 gro_result_t napi_gro_frags(struct napi_struct *napi)
3514 {
3515         struct sk_buff *skb = napi_frags_skb(napi);
3516
3517         if (!skb)
3518                 return GRO_DROP;
3519
3520         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3521 }
3522 EXPORT_SYMBOL(napi_gro_frags);
3523
3524 /*
3525  * net_rps_action sends any pending IPI's for rps.
3526  * Note: called with local irq disabled, but exits with local irq enabled.
3527  */
3528 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3529 {
3530 #ifdef CONFIG_RPS
3531         struct softnet_data *remsd = sd->rps_ipi_list;
3532
3533         if (remsd) {
3534                 sd->rps_ipi_list = NULL;
3535
3536                 local_irq_enable();
3537
3538                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3539                 while (remsd) {
3540                         struct softnet_data *next = remsd->rps_ipi_next;
3541
3542                         if (cpu_online(remsd->cpu))
3543                                 __smp_call_function_single(remsd->cpu,
3544                                                            &remsd->csd, 0);
3545                         remsd = next;
3546                 }
3547         } else
3548 #endif
3549                 local_irq_enable();
3550 }
3551
3552 static int process_backlog(struct napi_struct *napi, int quota)
3553 {
3554         int work = 0;
3555         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3556
3557 #ifdef CONFIG_RPS
3558         /* Check if we have pending ipi, its better to send them now,
3559          * not waiting net_rx_action() end.
3560          */
3561         if (sd->rps_ipi_list) {
3562                 local_irq_disable();
3563                 net_rps_action_and_irq_enable(sd);
3564         }
3565 #endif
3566         napi->weight = weight_p;
3567         local_irq_disable();
3568         while (work < quota) {
3569                 struct sk_buff *skb;
3570                 unsigned int qlen;
3571
3572                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3573                         local_irq_enable();
3574                         __netif_receive_skb(skb);
3575                         local_irq_disable();
3576                         input_queue_head_incr(sd);
3577                         if (++work >= quota) {
3578                                 local_irq_enable();
3579                                 return work;
3580                         }
3581                 }
3582
3583                 rps_lock(sd);
3584                 qlen = skb_queue_len(&sd->input_pkt_queue);
3585                 if (qlen)
3586                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3587                                                    &sd->process_queue);
3588
3589                 if (qlen < quota - work) {
3590                         /*
3591                          * Inline a custom version of __napi_complete().
3592                          * only current cpu owns and manipulates this napi,
3593                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3594                          * we can use a plain write instead of clear_bit(),
3595                          * and we dont need an smp_mb() memory barrier.
3596                          */
3597                         list_del(&napi->poll_list);
3598                         napi->state = 0;
3599
3600                         quota = work + qlen;
3601                 }
3602                 rps_unlock(sd);
3603         }
3604         local_irq_enable();
3605
3606         return work;
3607 }
3608
3609 /**
3610  * __napi_schedule - schedule for receive
3611  * @n: entry to schedule
3612  *
3613  * The entry's receive function will be scheduled to run
3614  */
3615 void __napi_schedule(struct napi_struct *n)
3616 {
3617         unsigned long flags;
3618
3619         local_irq_save(flags);
3620         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3621         local_irq_restore(flags);
3622 }
3623 EXPORT_SYMBOL(__napi_schedule);
3624
3625 void __napi_complete(struct napi_struct *n)
3626 {
3627         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3628         BUG_ON(n->gro_list);
3629
3630         list_del(&n->poll_list);
3631         smp_mb__before_clear_bit();
3632         clear_bit(NAPI_STATE_SCHED, &n->state);
3633 }
3634 EXPORT_SYMBOL(__napi_complete);
3635
3636 void napi_complete(struct napi_struct *n)
3637 {
3638         unsigned long flags;
3639
3640         /*
3641          * don't let napi dequeue from the cpu poll list
3642          * just in case its running on a different cpu
3643          */
3644         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3645                 return;
3646
3647         napi_gro_flush(n);
3648         local_irq_save(flags);
3649         __napi_complete(n);
3650         local_irq_restore(flags);
3651 }
3652 EXPORT_SYMBOL(napi_complete);
3653
3654 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3655                     int (*poll)(struct napi_struct *, int), int weight)
3656 {
3657         INIT_LIST_HEAD(&napi->poll_list);
3658         napi->gro_count = 0;
3659         napi->gro_list = NULL;
3660         napi->skb = NULL;
3661         napi->poll = poll;
3662         napi->weight = weight;
3663         list_add(&napi->dev_list, &dev->napi_list);
3664         napi->dev = dev;
3665 #ifdef CONFIG_NETPOLL
3666         spin_lock_init(&napi->poll_lock);
3667         napi->poll_owner = -1;
3668 #endif
3669         set_bit(NAPI_STATE_SCHED, &napi->state);
3670 }
3671 EXPORT_SYMBOL(netif_napi_add);
3672
3673 void netif_napi_del(struct napi_struct *napi)
3674 {
3675         struct sk_buff *skb, *next;
3676
3677         list_del_init(&napi->dev_list);
3678         napi_free_frags(napi);
3679
3680         for (skb = napi->gro_list; skb; skb = next) {
3681                 next = skb->next;
3682                 skb->next = NULL;
3683                 kfree_skb(skb);
3684         }
3685
3686         napi->gro_list = NULL;
3687         napi->gro_count = 0;
3688 }
3689 EXPORT_SYMBOL(netif_napi_del);
3690
3691 static void net_rx_action(struct softirq_action *h)
3692 {
3693         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3694         unsigned long time_limit = jiffies + 2;
3695         int budget = netdev_budget;
3696         void *have;
3697
3698         local_irq_disable();
3699
3700         while (!list_empty(&sd->poll_list)) {
3701                 struct napi_struct *n;
3702                 int work, weight;
3703
3704                 /* If softirq window is exhuasted then punt.
3705                  * Allow this to run for 2 jiffies since which will allow
3706                  * an average latency of 1.5/HZ.
3707                  */
3708                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3709                         goto softnet_break;
3710
3711                 local_irq_enable();
3712
3713                 /* Even though interrupts have been re-enabled, this
3714                  * access is safe because interrupts can only add new
3715                  * entries to the tail of this list, and only ->poll()
3716                  * calls can remove this head entry from the list.
3717                  */
3718                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3719
3720                 have = netpoll_poll_lock(n);
3721
3722                 weight = n->weight;
3723
3724                 /* This NAPI_STATE_SCHED test is for avoiding a race
3725                  * with netpoll's poll_napi().  Only the entity which
3726                  * obtains the lock and sees NAPI_STATE_SCHED set will
3727                  * actually make the ->poll() call.  Therefore we avoid
3728                  * accidently calling ->poll() when NAPI is not scheduled.
3729                  */
3730                 work = 0;
3731                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3732                         work = n->poll(n, weight);
3733                         trace_napi_poll(n);
3734                 }
3735
3736                 WARN_ON_ONCE(work > weight);
3737
3738                 budget -= work;
3739
3740                 local_irq_disable();
3741
3742                 /* Drivers must not modify the NAPI state if they
3743                  * consume the entire weight.  In such cases this code
3744                  * still "owns" the NAPI instance and therefore can
3745                  * move the instance around on the list at-will.
3746                  */
3747                 if (unlikely(work == weight)) {
3748                         if (unlikely(napi_disable_pending(n))) {
3749                                 local_irq_enable();
3750                                 napi_complete(n);
3751                                 local_irq_disable();
3752                         } else
3753                                 list_move_tail(&n->poll_list, &sd->poll_list);
3754                 }
3755
3756                 netpoll_poll_unlock(have);
3757         }
3758 out:
3759         net_rps_action_and_irq_enable(sd);
3760
3761 #ifdef CONFIG_NET_DMA
3762         /*
3763          * There may not be any more sk_buffs coming right now, so push
3764          * any pending DMA copies to hardware
3765          */
3766         dma_issue_pending_all();
3767 #endif
3768
3769         return;
3770
3771 softnet_break:
3772         sd->time_squeeze++;
3773         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3774         goto out;
3775 }
3776
3777 static gifconf_func_t *gifconf_list[NPROTO];
3778
3779 /**
3780  *      register_gifconf        -       register a SIOCGIF handler
3781  *      @family: Address family
3782  *      @gifconf: Function handler
3783  *
3784  *      Register protocol dependent address dumping routines. The handler
3785  *      that is passed must not be freed or reused until it has been replaced
3786  *      by another handler.
3787  */
3788 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3789 {
3790         if (family >= NPROTO)
3791                 return -EINVAL;
3792         gifconf_list[family] = gifconf;
3793         return 0;
3794 }
3795 EXPORT_SYMBOL(register_gifconf);
3796
3797
3798 /*
3799  *      Map an interface index to its name (SIOCGIFNAME)
3800  */
3801
3802 /*
3803  *      We need this ioctl for efficient implementation of the
3804  *      if_indextoname() function required by the IPv6 API.  Without
3805  *      it, we would have to search all the interfaces to find a
3806  *      match.  --pb
3807  */
3808
3809 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3810 {
3811         struct net_device *dev;
3812         struct ifreq ifr;
3813
3814         /*
3815          *      Fetch the caller's info block.
3816          */
3817
3818         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3819                 return -EFAULT;
3820
3821         rcu_read_lock();
3822         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3823         if (!dev) {
3824                 rcu_read_unlock();
3825                 return -ENODEV;
3826         }
3827
3828         strcpy(ifr.ifr_name, dev->name);
3829         rcu_read_unlock();
3830
3831         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3832                 return -EFAULT;
3833         return 0;
3834 }
3835
3836 /*
3837  *      Perform a SIOCGIFCONF call. This structure will change
3838  *      size eventually, and there is nothing I can do about it.
3839  *      Thus we will need a 'compatibility mode'.
3840  */
3841
3842 static int dev_ifconf(struct net *net, char __user *arg)
3843 {
3844         struct ifconf ifc;
3845         struct net_device *dev;
3846         char __user *pos;
3847         int len;
3848         int total;
3849         int i;
3850
3851         /*
3852          *      Fetch the caller's info block.
3853          */
3854
3855         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3856                 return -EFAULT;
3857
3858         pos = ifc.ifc_buf;
3859         len = ifc.ifc_len;
3860
3861         /*
3862          *      Loop over the interfaces, and write an info block for each.
3863          */
3864
3865         total = 0;
3866         for_each_netdev(net, dev) {
3867                 for (i = 0; i < NPROTO; i++) {
3868                         if (gifconf_list[i]) {
3869                                 int done;
3870                                 if (!pos)
3871                                         done = gifconf_list[i](dev, NULL, 0);
3872                                 else
3873                                         done = gifconf_list[i](dev, pos + total,
3874                                                                len - total);
3875                                 if (done < 0)
3876                                         return -EFAULT;
3877                                 total += done;
3878                         }
3879                 }
3880         }
3881
3882         /*
3883          *      All done.  Write the updated control block back to the caller.
3884          */
3885         ifc.ifc_len = total;
3886
3887         /*
3888          *      Both BSD and Solaris return 0 here, so we do too.
3889          */
3890         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3891 }
3892
3893 #ifdef CONFIG_PROC_FS
3894 /*
3895  *      This is invoked by the /proc filesystem handler to display a device
3896  *      in detail.
3897  */
3898 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3899         __acquires(RCU)
3900 {
3901         struct net *net = seq_file_net(seq);
3902         loff_t off;
3903         struct net_device *dev;
3904
3905         rcu_read_lock();
3906         if (!*pos)
3907                 return SEQ_START_TOKEN;
3908
3909         off = 1;
3910         for_each_netdev_rcu(net, dev)
3911                 if (off++ == *pos)
3912                         return dev;
3913
3914         return NULL;
3915 }
3916
3917 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3918 {
3919         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3920                                   first_net_device(seq_file_net(seq)) :
3921                                   next_net_device((struct net_device *)v);
3922
3923         ++*pos;
3924         return rcu_dereference(dev);
3925 }
3926
3927 void dev_seq_stop(struct seq_file *seq, void *v)
3928         __releases(RCU)
3929 {
3930         rcu_read_unlock();
3931 }
3932
3933 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3934 {
3935         struct rtnl_link_stats64 temp;
3936         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3937
3938         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3939                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3940                    dev->name, stats->rx_bytes, stats->rx_packets,
3941                    stats->rx_errors,
3942                    stats->rx_dropped + stats->rx_missed_errors,
3943                    stats->rx_fifo_errors,
3944                    stats->rx_length_errors + stats->rx_over_errors +
3945                     stats->rx_crc_errors + stats->rx_frame_errors,
3946                    stats->rx_compressed, stats->multicast,
3947                    stats->tx_bytes, stats->tx_packets,
3948                    stats->tx_errors, stats->tx_dropped,
3949                    stats->tx_fifo_errors, stats->collisions,
3950                    stats->tx_carrier_errors +
3951                     stats->tx_aborted_errors +
3952                     stats->tx_window_errors +
3953                     stats->tx_heartbeat_errors,
3954                    stats->tx_compressed);
3955 }
3956
3957 /*
3958  *      Called from the PROCfs module. This now uses the new arbitrary sized
3959  *      /proc/net interface to create /proc/net/dev
3960  */
3961 static int dev_seq_show(struct seq_file *seq, void *v)
3962 {
3963         if (v == SEQ_START_TOKEN)
3964                 seq_puts(seq, "Inter-|   Receive                            "
3965                               "                    |  Transmit\n"
3966                               " face |bytes    packets errs drop fifo frame "
3967                               "compressed multicast|bytes    packets errs "
3968                               "drop fifo colls carrier compressed\n");
3969         else
3970                 dev_seq_printf_stats(seq, v);
3971         return 0;
3972 }
3973
3974 static struct softnet_data *softnet_get_online(loff_t *pos)
3975 {
3976         struct softnet_data *sd = NULL;
3977
3978         while (*pos < nr_cpu_ids)
3979                 if (cpu_online(*pos)) {
3980                         sd = &per_cpu(softnet_data, *pos);
3981                         break;
3982                 } else
3983                         ++*pos;
3984         return sd;
3985 }
3986
3987 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3988 {
3989         return softnet_get_online(pos);
3990 }
3991
3992 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3993 {
3994         ++*pos;
3995         return softnet_get_online(pos);
3996 }
3997
3998 static void softnet_seq_stop(struct seq_file *seq, void *v)
3999 {
4000 }
4001
4002 static int softnet_seq_show(struct seq_file *seq, void *v)
4003 {
4004         struct softnet_data *sd = v;
4005
4006         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4007                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4008                    0, 0, 0, 0, /* was fastroute */
4009                    sd->cpu_collision, sd->received_rps);
4010         return 0;
4011 }
4012
4013 static const struct seq_operations dev_seq_ops = {
4014         .start = dev_seq_start,
4015         .next  = dev_seq_next,
4016         .stop  = dev_seq_stop,
4017         .show  = dev_seq_show,
4018 };
4019
4020 static int dev_seq_open(struct inode *inode, struct file *file)
4021 {
4022         return seq_open_net(inode, file, &dev_seq_ops,
4023                             sizeof(struct seq_net_private));
4024 }
4025
4026 static const struct file_operations dev_seq_fops = {
4027         .owner   = THIS_MODULE,
4028         .open    = dev_seq_open,
4029         .read    = seq_read,
4030         .llseek  = seq_lseek,
4031         .release = seq_release_net,
4032 };
4033
4034 static const struct seq_operations softnet_seq_ops = {
4035         .start = softnet_seq_start,
4036         .next  = softnet_seq_next,
4037         .stop  = softnet_seq_stop,
4038         .show  = softnet_seq_show,
4039 };
4040
4041 static int softnet_seq_open(struct inode *inode, struct file *file)
4042 {
4043         return seq_open(file, &softnet_seq_ops);
4044 }
4045
4046 static const struct file_operations softnet_seq_fops = {
4047         .owner   = THIS_MODULE,
4048         .open    = softnet_seq_open,
4049         .read    = seq_read,
4050         .llseek  = seq_lseek,
4051         .release = seq_release,
4052 };
4053
4054 static void *ptype_get_idx(loff_t pos)
4055 {
4056         struct packet_type *pt = NULL;
4057         loff_t i = 0;
4058         int t;
4059
4060         list_for_each_entry_rcu(pt, &ptype_all, list) {
4061                 if (i == pos)
4062                         return pt;
4063                 ++i;
4064         }
4065
4066         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4067                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4068                         if (i == pos)
4069                                 return pt;
4070                         ++i;
4071                 }
4072         }
4073         return NULL;
4074 }
4075
4076 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4077         __acquires(RCU)
4078 {
4079         rcu_read_lock();
4080         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4081 }
4082
4083 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4084 {
4085         struct packet_type *pt;
4086         struct list_head *nxt;
4087         int hash;
4088
4089         ++*pos;
4090         if (v == SEQ_START_TOKEN)
4091                 return ptype_get_idx(0);
4092
4093         pt = v;
4094         nxt = pt->list.next;
4095         if (pt->type == htons(ETH_P_ALL)) {
4096                 if (nxt != &ptype_all)
4097                         goto found;
4098                 hash = 0;
4099                 nxt = ptype_base[0].next;
4100         } else
4101                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4102
4103         while (nxt == &ptype_base[hash]) {
4104                 if (++hash >= PTYPE_HASH_SIZE)
4105                         return NULL;
4106                 nxt = ptype_base[hash].next;
4107         }
4108 found:
4109         return list_entry(nxt, struct packet_type, list);
4110 }
4111
4112 static void ptype_seq_stop(struct seq_file *seq, void *v)
4113         __releases(RCU)
4114 {
4115         rcu_read_unlock();
4116 }
4117
4118 static int ptype_seq_show(struct seq_file *seq, void *v)
4119 {
4120         struct packet_type *pt = v;
4121
4122         if (v == SEQ_START_TOKEN)
4123                 seq_puts(seq, "Type Device      Function\n");
4124         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4125                 if (pt->type == htons(ETH_P_ALL))
4126                         seq_puts(seq, "ALL ");
4127                 else
4128                         seq_printf(seq, "%04x", ntohs(pt->type));
4129
4130                 seq_printf(seq, " %-8s %pF\n",
4131                            pt->dev ? pt->dev->name : "", pt->func);
4132         }
4133
4134         return 0;
4135 }
4136
4137 static const struct seq_operations ptype_seq_ops = {
4138         .start = ptype_seq_start,
4139         .next  = ptype_seq_next,
4140         .stop  = ptype_seq_stop,
4141         .show  = ptype_seq_show,
4142 };
4143
4144 static int ptype_seq_open(struct inode *inode, struct file *file)
4145 {
4146         return seq_open_net(inode, file, &ptype_seq_ops,
4147                         sizeof(struct seq_net_private));
4148 }
4149
4150 static const struct file_operations ptype_seq_fops = {
4151         .owner   = THIS_MODULE,
4152         .open    = ptype_seq_open,
4153         .read    = seq_read,
4154         .llseek  = seq_lseek,
4155         .release = seq_release_net,
4156 };
4157
4158
4159 static int __net_init dev_proc_net_init(struct net *net)
4160 {
4161         int rc = -ENOMEM;
4162
4163         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4164                 goto out;
4165         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4166                 goto out_dev;
4167         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4168                 goto out_softnet;
4169
4170         if (wext_proc_init(net))
4171                 goto out_ptype;
4172         rc = 0;
4173 out:
4174         return rc;
4175 out_ptype:
4176         proc_net_remove(net, "ptype");
4177 out_softnet:
4178         proc_net_remove(net, "softnet_stat");
4179 out_dev:
4180         proc_net_remove(net, "dev");
4181         goto out;
4182 }
4183
4184 static void __net_exit dev_proc_net_exit(struct net *net)
4185 {
4186         wext_proc_exit(net);
4187
4188         proc_net_remove(net, "ptype");
4189         proc_net_remove(net, "softnet_stat");
4190         proc_net_remove(net, "dev");
4191 }
4192
4193 static struct pernet_operations __net_initdata dev_proc_ops = {
4194         .init = dev_proc_net_init,
4195         .exit = dev_proc_net_exit,
4196 };
4197
4198 static int __init dev_proc_init(void)
4199 {
4200         return register_pernet_subsys(&dev_proc_ops);
4201 }
4202 #else
4203 #define dev_proc_init() 0
4204 #endif  /* CONFIG_PROC_FS */
4205
4206
4207 /**
4208  *      netdev_set_master       -       set up master/slave pair
4209  *      @slave: slave device
4210  *      @master: new master device
4211  *
4212  *      Changes the master device of the slave. Pass %NULL to break the
4213  *      bonding. The caller must hold the RTNL semaphore. On a failure
4214  *      a negative errno code is returned. On success the reference counts
4215  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4216  *      function returns zero.
4217  */
4218 int netdev_set_master(struct net_device *slave, struct net_device *master)
4219 {
4220         struct net_device *old = slave->master;
4221
4222         ASSERT_RTNL();
4223
4224         if (master) {
4225                 if (old)
4226                         return -EBUSY;
4227                 dev_hold(master);
4228         }
4229
4230         slave->master = master;
4231
4232         if (old) {
4233                 synchronize_net();
4234                 dev_put(old);
4235         }
4236         if (master)
4237                 slave->flags |= IFF_SLAVE;
4238         else
4239                 slave->flags &= ~IFF_SLAVE;
4240
4241         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4242         return 0;
4243 }
4244 EXPORT_SYMBOL(netdev_set_master);
4245
4246 static void dev_change_rx_flags(struct net_device *dev, int flags)
4247 {
4248         const struct net_device_ops *ops = dev->netdev_ops;
4249
4250         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4251                 ops->ndo_change_rx_flags(dev, flags);
4252 }
4253
4254 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4255 {
4256         unsigned short old_flags = dev->flags;
4257         uid_t uid;
4258         gid_t gid;
4259
4260         ASSERT_RTNL();
4261
4262         dev->flags |= IFF_PROMISC;
4263         dev->promiscuity += inc;
4264         if (dev->promiscuity == 0) {
4265                 /*
4266                  * Avoid overflow.
4267                  * If inc causes overflow, untouch promisc and return error.
4268                  */
4269                 if (inc < 0)
4270                         dev->flags &= ~IFF_PROMISC;
4271                 else {
4272                         dev->promiscuity -= inc;
4273                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4274                                 "set promiscuity failed, promiscuity feature "
4275                                 "of device might be broken.\n", dev->name);
4276                         return -EOVERFLOW;
4277                 }
4278         }
4279         if (dev->flags != old_flags) {
4280                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4281                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4282                                                                "left");
4283                 if (audit_enabled) {
4284                         current_uid_gid(&uid, &gid);
4285                         audit_log(current->audit_context, GFP_ATOMIC,
4286                                 AUDIT_ANOM_PROMISCUOUS,
4287                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4288                                 dev->name, (dev->flags & IFF_PROMISC),
4289                                 (old_flags & IFF_PROMISC),
4290                                 audit_get_loginuid(current),
4291                                 uid, gid,
4292                                 audit_get_sessionid(current));
4293                 }
4294
4295                 dev_change_rx_flags(dev, IFF_PROMISC);
4296         }
4297         return 0;
4298 }
4299
4300 /**
4301  *      dev_set_promiscuity     - update promiscuity count on a device
4302  *      @dev: device
4303  *      @inc: modifier
4304  *
4305  *      Add or remove promiscuity from a device. While the count in the device
4306  *      remains above zero the interface remains promiscuous. Once it hits zero
4307  *      the device reverts back to normal filtering operation. A negative inc
4308  *      value is used to drop promiscuity on the device.
4309  *      Return 0 if successful or a negative errno code on error.
4310  */
4311 int dev_set_promiscuity(struct net_device *dev, int inc)
4312 {
4313         unsigned short old_flags = dev->flags;
4314         int err;
4315
4316         err = __dev_set_promiscuity(dev, inc);
4317         if (err < 0)
4318                 return err;
4319         if (dev->flags != old_flags)
4320                 dev_set_rx_mode(dev);
4321         return err;
4322 }
4323 EXPORT_SYMBOL(dev_set_promiscuity);
4324
4325 /**
4326  *      dev_set_allmulti        - update allmulti count on a device
4327  *      @dev: device
4328  *      @inc: modifier
4329  *
4330  *      Add or remove reception of all multicast frames to a device. While the
4331  *      count in the device remains above zero the interface remains listening
4332  *      to all interfaces. Once it hits zero the device reverts back to normal
4333  *      filtering operation. A negative @inc value is used to drop the counter
4334  *      when releasing a resource needing all multicasts.
4335  *      Return 0 if successful or a negative errno code on error.
4336  */
4337
4338 int dev_set_allmulti(struct net_device *dev, int inc)
4339 {
4340         unsigned short old_flags = dev->flags;
4341
4342         ASSERT_RTNL();
4343
4344         dev->flags |= IFF_ALLMULTI;
4345         dev->allmulti += inc;
4346         if (dev->allmulti == 0) {
4347                 /*
4348                  * Avoid overflow.
4349                  * If inc causes overflow, untouch allmulti and return error.
4350                  */
4351                 if (inc < 0)
4352                         dev->flags &= ~IFF_ALLMULTI;
4353                 else {
4354                         dev->allmulti -= inc;
4355                         printk(KERN_WARNING "%s: allmulti touches roof, "
4356                                 "set allmulti failed, allmulti feature of "
4357                                 "device might be broken.\n", dev->name);
4358                         return -EOVERFLOW;
4359                 }
4360         }
4361         if (dev->flags ^ old_flags) {
4362                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4363                 dev_set_rx_mode(dev);
4364         }
4365         return 0;
4366 }
4367 EXPORT_SYMBOL(dev_set_allmulti);
4368
4369 /*
4370  *      Upload unicast and multicast address lists to device and
4371  *      configure RX filtering. When the device doesn't support unicast
4372  *      filtering it is put in promiscuous mode while unicast addresses
4373  *      are present.
4374  */
4375 void __dev_set_rx_mode(struct net_device *dev)
4376 {
4377         const struct net_device_ops *ops = dev->netdev_ops;
4378
4379         /* dev_open will call this function so the list will stay sane. */
4380         if (!(dev->flags&IFF_UP))
4381                 return;
4382
4383         if (!netif_device_present(dev))
4384                 return;
4385
4386         if (ops->ndo_set_rx_mode)
4387                 ops->ndo_set_rx_mode(dev);
4388         else {
4389                 /* Unicast addresses changes may only happen under the rtnl,
4390                  * therefore calling __dev_set_promiscuity here is safe.
4391                  */
4392                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4393                         __dev_set_promiscuity(dev, 1);
4394                         dev->uc_promisc = 1;
4395                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4396                         __dev_set_promiscuity(dev, -1);
4397                         dev->uc_promisc = 0;
4398                 }
4399
4400                 if (ops->ndo_set_multicast_list)
4401                         ops->ndo_set_multicast_list(dev);
4402         }
4403 }
4404
4405 void dev_set_rx_mode(struct net_device *dev)
4406 {
4407         netif_addr_lock_bh(dev);
4408         __dev_set_rx_mode(dev);
4409         netif_addr_unlock_bh(dev);
4410 }
4411
4412 /**
4413  *      dev_get_flags - get flags reported to userspace
4414  *      @dev: device
4415  *
4416  *      Get the combination of flag bits exported through APIs to userspace.
4417  */
4418 unsigned dev_get_flags(const struct net_device *dev)
4419 {
4420         unsigned flags;
4421
4422         flags = (dev->flags & ~(IFF_PROMISC |
4423                                 IFF_ALLMULTI |
4424                                 IFF_RUNNING |
4425                                 IFF_LOWER_UP |
4426                                 IFF_DORMANT)) |
4427                 (dev->gflags & (IFF_PROMISC |
4428                                 IFF_ALLMULTI));
4429
4430         if (netif_running(dev)) {
4431                 if (netif_oper_up(dev))
4432                         flags |= IFF_RUNNING;
4433                 if (netif_carrier_ok(dev))
4434                         flags |= IFF_LOWER_UP;
4435                 if (netif_dormant(dev))
4436                         flags |= IFF_DORMANT;
4437         }
4438
4439         return flags;
4440 }
4441 EXPORT_SYMBOL(dev_get_flags);
4442
4443 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4444 {
4445         int old_flags = dev->flags;
4446         int ret;
4447
4448         ASSERT_RTNL();
4449
4450         /*
4451          *      Set the flags on our device.
4452          */
4453
4454         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4455                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4456                                IFF_AUTOMEDIA)) |
4457                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4458                                     IFF_ALLMULTI));
4459
4460         /*
4461          *      Load in the correct multicast list now the flags have changed.
4462          */
4463
4464         if ((old_flags ^ flags) & IFF_MULTICAST)
4465                 dev_change_rx_flags(dev, IFF_MULTICAST);
4466
4467         dev_set_rx_mode(dev);
4468
4469         /*
4470          *      Have we downed the interface. We handle IFF_UP ourselves
4471          *      according to user attempts to set it, rather than blindly
4472          *      setting it.
4473          */
4474
4475         ret = 0;
4476         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4477                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4478
4479                 if (!ret)
4480                         dev_set_rx_mode(dev);
4481         }
4482
4483         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4484                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4485
4486                 dev->gflags ^= IFF_PROMISC;
4487                 dev_set_promiscuity(dev, inc);
4488         }
4489
4490         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4491            is important. Some (broken) drivers set IFF_PROMISC, when
4492            IFF_ALLMULTI is requested not asking us and not reporting.
4493          */
4494         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4495                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4496
4497                 dev->gflags ^= IFF_ALLMULTI;
4498                 dev_set_allmulti(dev, inc);
4499         }
4500
4501         return ret;
4502 }
4503
4504 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4505 {
4506         unsigned int changes = dev->flags ^ old_flags;
4507
4508         if (changes & IFF_UP) {
4509                 if (dev->flags & IFF_UP)
4510                         call_netdevice_notifiers(NETDEV_UP, dev);
4511                 else
4512                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4513         }
4514
4515         if (dev->flags & IFF_UP &&
4516             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4517                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4518 }
4519
4520 /**
4521  *      dev_change_flags - change device settings
4522  *      @dev: device
4523  *      @flags: device state flags
4524  *
4525  *      Change settings on device based state flags. The flags are
4526  *      in the userspace exported format.
4527  */
4528 int dev_change_flags(struct net_device *dev, unsigned flags)
4529 {
4530         int ret, changes;
4531         int old_flags = dev->flags;
4532
4533         ret = __dev_change_flags(dev, flags);
4534         if (ret < 0)
4535                 return ret;
4536
4537         changes = old_flags ^ dev->flags;
4538         if (changes)
4539                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4540
4541         __dev_notify_flags(dev, old_flags);
4542         return ret;
4543 }
4544 EXPORT_SYMBOL(dev_change_flags);
4545
4546 /**
4547  *      dev_set_mtu - Change maximum transfer unit
4548  *      @dev: device
4549  *      @new_mtu: new transfer unit
4550  *
4551  *      Change the maximum transfer size of the network device.
4552  */
4553 int dev_set_mtu(struct net_device *dev, int new_mtu)
4554 {
4555         const struct net_device_ops *ops = dev->netdev_ops;
4556         int err;
4557
4558         if (new_mtu == dev->mtu)
4559                 return 0;
4560
4561         /*      MTU must be positive.    */
4562         if (new_mtu < 0)
4563                 return -EINVAL;
4564
4565         if (!netif_device_present(dev))
4566                 return -ENODEV;
4567
4568         err = 0;
4569         if (ops->ndo_change_mtu)
4570                 err = ops->ndo_change_mtu(dev, new_mtu);
4571         else
4572                 dev->mtu = new_mtu;
4573
4574         if (!err && dev->flags & IFF_UP)
4575                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4576         return err;
4577 }
4578 EXPORT_SYMBOL(dev_set_mtu);
4579
4580 /**
4581  *      dev_set_mac_address - Change Media Access Control Address
4582  *      @dev: device
4583  *      @sa: new address
4584  *
4585  *      Change the hardware (MAC) address of the device
4586  */
4587 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4588 {
4589         const struct net_device_ops *ops = dev->netdev_ops;
4590         int err;
4591
4592         if (!ops->ndo_set_mac_address)
4593                 return -EOPNOTSUPP;
4594         if (sa->sa_family != dev->type)
4595                 return -EINVAL;
4596         if (!netif_device_present(dev))
4597                 return -ENODEV;
4598         err = ops->ndo_set_mac_address(dev, sa);
4599         if (!err)
4600                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4601         return err;
4602 }
4603 EXPORT_SYMBOL(dev_set_mac_address);
4604
4605 /*
4606  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4607  */
4608 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4609 {
4610         int err;
4611         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4612
4613         if (!dev)
4614                 return -ENODEV;
4615
4616         switch (cmd) {
4617         case SIOCGIFFLAGS:      /* Get interface flags */
4618                 ifr->ifr_flags = (short) dev_get_flags(dev);
4619                 return 0;
4620
4621         case SIOCGIFMETRIC:     /* Get the metric on the interface
4622                                    (currently unused) */
4623                 ifr->ifr_metric = 0;
4624                 return 0;
4625
4626         case SIOCGIFMTU:        /* Get the MTU of a device */
4627                 ifr->ifr_mtu = dev->mtu;
4628                 return 0;
4629
4630         case SIOCGIFHWADDR:
4631                 if (!dev->addr_len)
4632                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4633                 else
4634                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4635                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4636                 ifr->ifr_hwaddr.sa_family = dev->type;
4637                 return 0;
4638
4639         case SIOCGIFSLAVE:
4640                 err = -EINVAL;
4641                 break;
4642
4643         case SIOCGIFMAP:
4644                 ifr->ifr_map.mem_start = dev->mem_start;
4645                 ifr->ifr_map.mem_end   = dev->mem_end;
4646                 ifr->ifr_map.base_addr = dev->base_addr;
4647                 ifr->ifr_map.irq       = dev->irq;
4648                 ifr->ifr_map.dma       = dev->dma;
4649                 ifr->ifr_map.port      = dev->if_port;
4650                 return 0;
4651
4652         case SIOCGIFINDEX:
4653                 ifr->ifr_ifindex = dev->ifindex;
4654                 return 0;
4655
4656         case SIOCGIFTXQLEN:
4657                 ifr->ifr_qlen = dev->tx_queue_len;
4658                 return 0;
4659
4660         default:
4661                 /* dev_ioctl() should ensure this case
4662                  * is never reached
4663                  */
4664                 WARN_ON(1);
4665                 err = -EINVAL;
4666                 break;
4667
4668         }
4669         return err;
4670 }
4671
4672 /*
4673  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4674  */
4675 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4676 {
4677         int err;
4678         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4679         const struct net_device_ops *ops;
4680
4681         if (!dev)
4682                 return -ENODEV;
4683
4684         ops = dev->netdev_ops;
4685
4686         switch (cmd) {
4687         case SIOCSIFFLAGS:      /* Set interface flags */
4688                 return dev_change_flags(dev, ifr->ifr_flags);
4689
4690         case SIOCSIFMETRIC:     /* Set the metric on the interface
4691                                    (currently unused) */
4692                 return -EOPNOTSUPP;
4693
4694         case SIOCSIFMTU:        /* Set the MTU of a device */
4695                 return dev_set_mtu(dev, ifr->ifr_mtu);
4696
4697         case SIOCSIFHWADDR:
4698                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4699
4700         case SIOCSIFHWBROADCAST:
4701                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4702                         return -EINVAL;
4703                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4704                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4705                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4706                 return 0;
4707
4708         case SIOCSIFMAP:
4709                 if (ops->ndo_set_config) {
4710                         if (!netif_device_present(dev))
4711                                 return -ENODEV;
4712                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4713                 }
4714                 return -EOPNOTSUPP;
4715
4716         case SIOCADDMULTI:
4717                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4718                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4719                         return -EINVAL;
4720                 if (!netif_device_present(dev))
4721                         return -ENODEV;
4722                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4723
4724         case SIOCDELMULTI:
4725                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4726                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4727                         return -EINVAL;
4728                 if (!netif_device_present(dev))
4729                         return -ENODEV;
4730                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4731
4732         case SIOCSIFTXQLEN:
4733                 if (ifr->ifr_qlen < 0)
4734                         return -EINVAL;
4735                 dev->tx_queue_len = ifr->ifr_qlen;
4736                 return 0;
4737
4738         case SIOCSIFNAME:
4739                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4740                 return dev_change_name(dev, ifr->ifr_newname);
4741
4742         /*
4743          *      Unknown or private ioctl
4744          */
4745         default:
4746                 if ((cmd >= SIOCDEVPRIVATE &&
4747                     cmd <= SIOCDEVPRIVATE + 15) ||
4748                     cmd == SIOCBONDENSLAVE ||
4749                     cmd == SIOCBONDRELEASE ||
4750                     cmd == SIOCBONDSETHWADDR ||
4751                     cmd == SIOCBONDSLAVEINFOQUERY ||
4752                     cmd == SIOCBONDINFOQUERY ||
4753                     cmd == SIOCBONDCHANGEACTIVE ||
4754                     cmd == SIOCGMIIPHY ||
4755                     cmd == SIOCGMIIREG ||
4756                     cmd == SIOCSMIIREG ||
4757                     cmd == SIOCBRADDIF ||
4758                     cmd == SIOCBRDELIF ||
4759                     cmd == SIOCSHWTSTAMP ||
4760                     cmd == SIOCWANDEV) {
4761                         err = -EOPNOTSUPP;
4762                         if (ops->ndo_do_ioctl) {
4763                                 if (netif_device_present(dev))
4764                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4765                                 else
4766                                         err = -ENODEV;
4767                         }
4768                 } else
4769                         err = -EINVAL;
4770
4771         }
4772         return err;
4773 }
4774
4775 /*
4776  *      This function handles all "interface"-type I/O control requests. The actual
4777  *      'doing' part of this is dev_ifsioc above.
4778  */
4779
4780 /**
4781  *      dev_ioctl       -       network device ioctl
4782  *      @net: the applicable net namespace
4783  *      @cmd: command to issue
4784  *      @arg: pointer to a struct ifreq in user space
4785  *
4786  *      Issue ioctl functions to devices. This is normally called by the
4787  *      user space syscall interfaces but can sometimes be useful for
4788  *      other purposes. The return value is the return from the syscall if
4789  *      positive or a negative errno code on error.
4790  */
4791
4792 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4793 {
4794         struct ifreq ifr;
4795         int ret;
4796         char *colon;
4797
4798         /* One special case: SIOCGIFCONF takes ifconf argument
4799            and requires shared lock, because it sleeps writing
4800            to user space.
4801          */
4802
4803         if (cmd == SIOCGIFCONF) {
4804                 rtnl_lock();
4805                 ret = dev_ifconf(net, (char __user *) arg);
4806                 rtnl_unlock();
4807                 return ret;
4808         }
4809         if (cmd == SIOCGIFNAME)
4810                 return dev_ifname(net, (struct ifreq __user *)arg);
4811
4812         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4813                 return -EFAULT;
4814
4815         ifr.ifr_name[IFNAMSIZ-1] = 0;
4816
4817         colon = strchr(ifr.ifr_name, ':');
4818         if (colon)
4819                 *colon = 0;
4820
4821         /*
4822          *      See which interface the caller is talking about.
4823          */
4824
4825         switch (cmd) {
4826         /*
4827          *      These ioctl calls:
4828          *      - can be done by all.
4829          *      - atomic and do not require locking.
4830          *      - return a value
4831          */
4832         case SIOCGIFFLAGS:
4833         case SIOCGIFMETRIC:
4834         case SIOCGIFMTU:
4835         case SIOCGIFHWADDR:
4836         case SIOCGIFSLAVE:
4837         case SIOCGIFMAP:
4838         case SIOCGIFINDEX:
4839         case SIOCGIFTXQLEN:
4840                 dev_load(net, ifr.ifr_name);
4841                 rcu_read_lock();
4842                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4843                 rcu_read_unlock();
4844                 if (!ret) {
4845                         if (colon)
4846                                 *colon = ':';
4847                         if (copy_to_user(arg, &ifr,
4848                                          sizeof(struct ifreq)))
4849                                 ret = -EFAULT;
4850                 }
4851                 return ret;
4852
4853         case SIOCETHTOOL:
4854                 dev_load(net, ifr.ifr_name);
4855                 rtnl_lock();
4856                 ret = dev_ethtool(net, &ifr);
4857                 rtnl_unlock();
4858                 if (!ret) {
4859                         if (colon)
4860                                 *colon = ':';
4861                         if (copy_to_user(arg, &ifr,
4862                                          sizeof(struct ifreq)))
4863                                 ret = -EFAULT;
4864                 }
4865                 return ret;
4866
4867         /*
4868          *      These ioctl calls:
4869          *      - require superuser power.
4870          *      - require strict serialization.
4871          *      - return a value
4872          */
4873         case SIOCGMIIPHY:
4874         case SIOCGMIIREG:
4875         case SIOCSIFNAME:
4876                 if (!capable(CAP_NET_ADMIN))
4877                         return -EPERM;
4878                 dev_load(net, ifr.ifr_name);
4879                 rtnl_lock();
4880                 ret = dev_ifsioc(net, &ifr, cmd);
4881                 rtnl_unlock();
4882                 if (!ret) {
4883                         if (colon)
4884                                 *colon = ':';
4885                         if (copy_to_user(arg, &ifr,
4886                                          sizeof(struct ifreq)))
4887                                 ret = -EFAULT;
4888                 }
4889                 return ret;
4890
4891         /*
4892          *      These ioctl calls:
4893          *      - require superuser power.
4894          *      - require strict serialization.
4895          *      - do not return a value
4896          */
4897         case SIOCSIFFLAGS:
4898         case SIOCSIFMETRIC:
4899         case SIOCSIFMTU:
4900         case SIOCSIFMAP:
4901         case SIOCSIFHWADDR:
4902         case SIOCSIFSLAVE:
4903         case SIOCADDMULTI:
4904         case SIOCDELMULTI:
4905         case SIOCSIFHWBROADCAST:
4906         case SIOCSIFTXQLEN:
4907         case SIOCSMIIREG:
4908         case SIOCBONDENSLAVE:
4909         case SIOCBONDRELEASE:
4910         case SIOCBONDSETHWADDR:
4911         case SIOCBONDCHANGEACTIVE:
4912         case SIOCBRADDIF:
4913         case SIOCBRDELIF:
4914         case SIOCSHWTSTAMP:
4915                 if (!capable(CAP_NET_ADMIN))
4916                         return -EPERM;
4917                 /* fall through */
4918         case SIOCBONDSLAVEINFOQUERY:
4919         case SIOCBONDINFOQUERY:
4920                 dev_load(net, ifr.ifr_name);
4921                 rtnl_lock();
4922                 ret = dev_ifsioc(net, &ifr, cmd);
4923                 rtnl_unlock();
4924                 return ret;
4925
4926         case SIOCGIFMEM:
4927                 /* Get the per device memory space. We can add this but
4928                  * currently do not support it */
4929         case SIOCSIFMEM:
4930                 /* Set the per device memory buffer space.
4931                  * Not applicable in our case */
4932         case SIOCSIFLINK:
4933                 return -EINVAL;
4934
4935         /*
4936          *      Unknown or private ioctl.
4937          */
4938         default:
4939                 if (cmd == SIOCWANDEV ||
4940                     (cmd >= SIOCDEVPRIVATE &&
4941                      cmd <= SIOCDEVPRIVATE + 15)) {
4942                         dev_load(net, ifr.ifr_name);
4943                         rtnl_lock();
4944                         ret = dev_ifsioc(net, &ifr, cmd);
4945                         rtnl_unlock();
4946                         if (!ret && copy_to_user(arg, &ifr,
4947                                                  sizeof(struct ifreq)))
4948                                 ret = -EFAULT;
4949                         return ret;
4950                 }
4951                 /* Take care of Wireless Extensions */
4952                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4953                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4954                 return -EINVAL;
4955         }
4956 }
4957
4958
4959 /**
4960  *      dev_new_index   -       allocate an ifindex
4961  *      @net: the applicable net namespace
4962  *
4963  *      Returns a suitable unique value for a new device interface
4964  *      number.  The caller must hold the rtnl semaphore or the
4965  *      dev_base_lock to be sure it remains unique.
4966  */
4967 static int dev_new_index(struct net *net)
4968 {
4969         static int ifindex;
4970         for (;;) {
4971                 if (++ifindex <= 0)
4972                         ifindex = 1;
4973                 if (!__dev_get_by_index(net, ifindex))
4974                         return ifindex;
4975         }
4976 }
4977
4978 /* Delayed registration/unregisteration */
4979 static LIST_HEAD(net_todo_list);
4980
4981 static void net_set_todo(struct net_device *dev)
4982 {
4983         list_add_tail(&dev->todo_list, &net_todo_list);
4984 }
4985
4986 static void rollback_registered_many(struct list_head *head)
4987 {
4988         struct net_device *dev, *tmp;
4989
4990         BUG_ON(dev_boot_phase);
4991         ASSERT_RTNL();
4992
4993         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4994                 /* Some devices call without registering
4995                  * for initialization unwind. Remove those
4996                  * devices and proceed with the remaining.
4997                  */
4998                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4999                         pr_debug("unregister_netdevice: device %s/%p never "
5000                                  "was registered\n", dev->name, dev);
5001
5002                         WARN_ON(1);
5003                         list_del(&dev->unreg_list);
5004                         continue;
5005                 }
5006
5007                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5008         }
5009
5010         /* If device is running, close it first. */
5011         dev_close_many(head);
5012
5013         list_for_each_entry(dev, head, unreg_list) {
5014                 /* And unlink it from device chain. */
5015                 unlist_netdevice(dev);
5016
5017                 dev->reg_state = NETREG_UNREGISTERING;
5018         }
5019
5020         synchronize_net();
5021
5022         list_for_each_entry(dev, head, unreg_list) {
5023                 /* Shutdown queueing discipline. */
5024                 dev_shutdown(dev);
5025
5026
5027                 /* Notify protocols, that we are about to destroy
5028                    this device. They should clean all the things.
5029                 */
5030                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5031
5032                 if (!dev->rtnl_link_ops ||
5033                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5034                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5035
5036                 /*
5037                  *      Flush the unicast and multicast chains
5038                  */
5039                 dev_uc_flush(dev);
5040                 dev_mc_flush(dev);
5041
5042                 if (dev->netdev_ops->ndo_uninit)
5043                         dev->netdev_ops->ndo_uninit(dev);
5044
5045                 /* Notifier chain MUST detach us from master device. */
5046                 WARN_ON(dev->master);
5047
5048                 /* Remove entries from kobject tree */
5049                 netdev_unregister_kobject(dev);
5050         }
5051
5052         /* Process any work delayed until the end of the batch */
5053         dev = list_first_entry(head, struct net_device, unreg_list);
5054         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5055
5056         rcu_barrier();
5057
5058         list_for_each_entry(dev, head, unreg_list)
5059                 dev_put(dev);
5060 }
5061
5062 static void rollback_registered(struct net_device *dev)
5063 {
5064         LIST_HEAD(single);
5065
5066         list_add(&dev->unreg_list, &single);
5067         rollback_registered_many(&single);
5068 }
5069
5070 unsigned long netdev_fix_features(unsigned long features, const char *name)
5071 {
5072         /* Fix illegal SG+CSUM combinations. */
5073         if ((features & NETIF_F_SG) &&
5074             !(features & NETIF_F_ALL_CSUM)) {
5075                 if (name)
5076                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5077                                "checksum feature.\n", name);
5078                 features &= ~NETIF_F_SG;
5079         }
5080
5081         /* TSO requires that SG is present as well. */
5082         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5083                 if (name)
5084                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5085                                "SG feature.\n", name);
5086                 features &= ~NETIF_F_TSO;
5087         }
5088
5089         if (features & NETIF_F_UFO) {
5090                 /* maybe split UFO into V4 and V6? */
5091                 if (!((features & NETIF_F_GEN_CSUM) ||
5092                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5093                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5094                         if (name)
5095                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5096                                        "since no checksum offload features.\n",
5097                                        name);
5098                         features &= ~NETIF_F_UFO;
5099                 }
5100
5101                 if (!(features & NETIF_F_SG)) {
5102                         if (name)
5103                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5104                                        "since no NETIF_F_SG feature.\n", name);
5105                         features &= ~NETIF_F_UFO;
5106                 }
5107         }
5108
5109         return features;
5110 }
5111 EXPORT_SYMBOL(netdev_fix_features);
5112
5113 /**
5114  *      netif_stacked_transfer_operstate -      transfer operstate
5115  *      @rootdev: the root or lower level device to transfer state from
5116  *      @dev: the device to transfer operstate to
5117  *
5118  *      Transfer operational state from root to device. This is normally
5119  *      called when a stacking relationship exists between the root
5120  *      device and the device(a leaf device).
5121  */
5122 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5123                                         struct net_device *dev)
5124 {
5125         if (rootdev->operstate == IF_OPER_DORMANT)
5126                 netif_dormant_on(dev);
5127         else
5128                 netif_dormant_off(dev);
5129
5130         if (netif_carrier_ok(rootdev)) {
5131                 if (!netif_carrier_ok(dev))
5132                         netif_carrier_on(dev);
5133         } else {
5134                 if (netif_carrier_ok(dev))
5135                         netif_carrier_off(dev);
5136         }
5137 }
5138 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5139
5140 #ifdef CONFIG_RPS
5141 static int netif_alloc_rx_queues(struct net_device *dev)
5142 {
5143         unsigned int i, count = dev->num_rx_queues;
5144         struct netdev_rx_queue *rx;
5145
5146         BUG_ON(count < 1);
5147
5148         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5149         if (!rx) {
5150                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5151                 return -ENOMEM;
5152         }
5153         dev->_rx = rx;
5154
5155         for (i = 0; i < count; i++)
5156                 rx[i].dev = dev;
5157         return 0;
5158 }
5159 #endif
5160
5161 static void netdev_init_one_queue(struct net_device *dev,
5162                                   struct netdev_queue *queue, void *_unused)
5163 {
5164         /* Initialize queue lock */
5165         spin_lock_init(&queue->_xmit_lock);
5166         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5167         queue->xmit_lock_owner = -1;
5168         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5169         queue->dev = dev;
5170 }
5171
5172 static int netif_alloc_netdev_queues(struct net_device *dev)
5173 {
5174         unsigned int count = dev->num_tx_queues;
5175         struct netdev_queue *tx;
5176
5177         BUG_ON(count < 1);
5178
5179         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5180         if (!tx) {
5181                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5182                        count);
5183                 return -ENOMEM;
5184         }
5185         dev->_tx = tx;
5186
5187         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5188         spin_lock_init(&dev->tx_global_lock);
5189
5190         return 0;
5191 }
5192
5193 /**
5194  *      register_netdevice      - register a network device
5195  *      @dev: device to register
5196  *
5197  *      Take a completed network device structure and add it to the kernel
5198  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5199  *      chain. 0 is returned on success. A negative errno code is returned
5200  *      on a failure to set up the device, or if the name is a duplicate.
5201  *
5202  *      Callers must hold the rtnl semaphore. You may want
5203  *      register_netdev() instead of this.
5204  *
5205  *      BUGS:
5206  *      The locking appears insufficient to guarantee two parallel registers
5207  *      will not get the same name.
5208  */
5209
5210 int register_netdevice(struct net_device *dev)
5211 {
5212         int ret;
5213         struct net *net = dev_net(dev);
5214
5215         BUG_ON(dev_boot_phase);
5216         ASSERT_RTNL();
5217
5218         might_sleep();
5219
5220         /* When net_device's are persistent, this will be fatal. */
5221         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5222         BUG_ON(!net);
5223
5224         spin_lock_init(&dev->addr_list_lock);
5225         netdev_set_addr_lockdep_class(dev);
5226
5227         dev->iflink = -1;
5228
5229         /* Init, if this function is available */
5230         if (dev->netdev_ops->ndo_init) {
5231                 ret = dev->netdev_ops->ndo_init(dev);
5232                 if (ret) {
5233                         if (ret > 0)
5234                                 ret = -EIO;
5235                         goto out;
5236                 }
5237         }
5238
5239         ret = dev_get_valid_name(dev, dev->name, 0);
5240         if (ret)
5241                 goto err_uninit;
5242
5243         dev->ifindex = dev_new_index(net);
5244         if (dev->iflink == -1)
5245                 dev->iflink = dev->ifindex;
5246
5247         /* Fix illegal checksum combinations */
5248         if ((dev->features & NETIF_F_HW_CSUM) &&
5249             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5250                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5251                        dev->name);
5252                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5253         }
5254
5255         if ((dev->features & NETIF_F_NO_CSUM) &&
5256             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5257                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5258                        dev->name);
5259                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5260         }
5261
5262         dev->features = netdev_fix_features(dev->features, dev->name);
5263
5264         /* Enable software GSO if SG is supported. */
5265         if (dev->features & NETIF_F_SG)
5266                 dev->features |= NETIF_F_GSO;
5267
5268         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5269          * vlan_dev_init() will do the dev->features check, so these features
5270          * are enabled only if supported by underlying device.
5271          */
5272         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5273
5274         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5275         ret = notifier_to_errno(ret);
5276         if (ret)
5277                 goto err_uninit;
5278
5279         ret = netdev_register_kobject(dev);
5280         if (ret)
5281                 goto err_uninit;
5282         dev->reg_state = NETREG_REGISTERED;
5283
5284         /*
5285          *      Default initial state at registry is that the
5286          *      device is present.
5287          */
5288
5289         set_bit(__LINK_STATE_PRESENT, &dev->state);
5290
5291         dev_init_scheduler(dev);
5292         dev_hold(dev);
5293         list_netdevice(dev);
5294
5295         /* Notify protocols, that a new device appeared. */
5296         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5297         ret = notifier_to_errno(ret);
5298         if (ret) {
5299                 rollback_registered(dev);
5300                 dev->reg_state = NETREG_UNREGISTERED;
5301         }
5302         /*
5303          *      Prevent userspace races by waiting until the network
5304          *      device is fully setup before sending notifications.
5305          */
5306         if (!dev->rtnl_link_ops ||
5307             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5308                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5309
5310 out:
5311         return ret;
5312
5313 err_uninit:
5314         if (dev->netdev_ops->ndo_uninit)
5315                 dev->netdev_ops->ndo_uninit(dev);
5316         goto out;
5317 }
5318 EXPORT_SYMBOL(register_netdevice);
5319
5320 /**
5321  *      init_dummy_netdev       - init a dummy network device for NAPI
5322  *      @dev: device to init
5323  *
5324  *      This takes a network device structure and initialize the minimum
5325  *      amount of fields so it can be used to schedule NAPI polls without
5326  *      registering a full blown interface. This is to be used by drivers
5327  *      that need to tie several hardware interfaces to a single NAPI
5328  *      poll scheduler due to HW limitations.
5329  */
5330 int init_dummy_netdev(struct net_device *dev)
5331 {
5332         /* Clear everything. Note we don't initialize spinlocks
5333          * are they aren't supposed to be taken by any of the
5334          * NAPI code and this dummy netdev is supposed to be
5335          * only ever used for NAPI polls
5336          */
5337         memset(dev, 0, sizeof(struct net_device));
5338
5339         /* make sure we BUG if trying to hit standard
5340          * register/unregister code path
5341          */
5342         dev->reg_state = NETREG_DUMMY;
5343
5344         /* NAPI wants this */
5345         INIT_LIST_HEAD(&dev->napi_list);
5346
5347         /* a dummy interface is started by default */
5348         set_bit(__LINK_STATE_PRESENT, &dev->state);
5349         set_bit(__LINK_STATE_START, &dev->state);
5350
5351         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5352          * because users of this 'device' dont need to change
5353          * its refcount.
5354          */
5355
5356         return 0;
5357 }
5358 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5359
5360
5361 /**
5362  *      register_netdev - register a network device
5363  *      @dev: device to register
5364  *
5365  *      Take a completed network device structure and add it to the kernel
5366  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5367  *      chain. 0 is returned on success. A negative errno code is returned
5368  *      on a failure to set up the device, or if the name is a duplicate.
5369  *
5370  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5371  *      and expands the device name if you passed a format string to
5372  *      alloc_netdev.
5373  */
5374 int register_netdev(struct net_device *dev)
5375 {
5376         int err;
5377
5378         rtnl_lock();
5379
5380         /*
5381          * If the name is a format string the caller wants us to do a
5382          * name allocation.
5383          */
5384         if (strchr(dev->name, '%')) {
5385                 err = dev_alloc_name(dev, dev->name);
5386                 if (err < 0)
5387                         goto out;
5388         }
5389
5390         err = register_netdevice(dev);
5391 out:
5392         rtnl_unlock();
5393         return err;
5394 }
5395 EXPORT_SYMBOL(register_netdev);
5396
5397 int netdev_refcnt_read(const struct net_device *dev)
5398 {
5399         int i, refcnt = 0;
5400
5401         for_each_possible_cpu(i)
5402                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5403         return refcnt;
5404 }
5405 EXPORT_SYMBOL(netdev_refcnt_read);
5406
5407 /*
5408  * netdev_wait_allrefs - wait until all references are gone.
5409  *
5410  * This is called when unregistering network devices.
5411  *
5412  * Any protocol or device that holds a reference should register
5413  * for netdevice notification, and cleanup and put back the
5414  * reference if they receive an UNREGISTER event.
5415  * We can get stuck here if buggy protocols don't correctly
5416  * call dev_put.
5417  */
5418 static void netdev_wait_allrefs(struct net_device *dev)
5419 {
5420         unsigned long rebroadcast_time, warning_time;
5421         int refcnt;
5422
5423         linkwatch_forget_dev(dev);
5424
5425         rebroadcast_time = warning_time = jiffies;
5426         refcnt = netdev_refcnt_read(dev);
5427
5428         while (refcnt != 0) {
5429                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5430                         rtnl_lock();
5431
5432                         /* Rebroadcast unregister notification */
5433                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5434                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5435                          * should have already handle it the first time */
5436
5437                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5438                                      &dev->state)) {
5439                                 /* We must not have linkwatch events
5440                                  * pending on unregister. If this
5441                                  * happens, we simply run the queue
5442                                  * unscheduled, resulting in a noop
5443                                  * for this device.
5444                                  */
5445                                 linkwatch_run_queue();
5446                         }
5447
5448                         __rtnl_unlock();
5449
5450                         rebroadcast_time = jiffies;
5451                 }
5452
5453                 msleep(250);
5454
5455                 refcnt = netdev_refcnt_read(dev);
5456
5457                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5458                         printk(KERN_EMERG "unregister_netdevice: "
5459                                "waiting for %s to become free. Usage "
5460                                "count = %d\n",
5461                                dev->name, refcnt);
5462                         warning_time = jiffies;
5463                 }
5464         }
5465 }
5466
5467 /* The sequence is:
5468  *
5469  *      rtnl_lock();
5470  *      ...
5471  *      register_netdevice(x1);
5472  *      register_netdevice(x2);
5473  *      ...
5474  *      unregister_netdevice(y1);
5475  *      unregister_netdevice(y2);
5476  *      ...
5477  *      rtnl_unlock();
5478  *      free_netdev(y1);
5479  *      free_netdev(y2);
5480  *
5481  * We are invoked by rtnl_unlock().
5482  * This allows us to deal with problems:
5483  * 1) We can delete sysfs objects which invoke hotplug
5484  *    without deadlocking with linkwatch via keventd.
5485  * 2) Since we run with the RTNL semaphore not held, we can sleep
5486  *    safely in order to wait for the netdev refcnt to drop to zero.
5487  *
5488  * We must not return until all unregister events added during
5489  * the interval the lock was held have been completed.
5490  */
5491 void netdev_run_todo(void)
5492 {
5493         struct list_head list;
5494
5495         /* Snapshot list, allow later requests */
5496         list_replace_init(&net_todo_list, &list);
5497
5498         __rtnl_unlock();
5499
5500         while (!list_empty(&list)) {
5501                 struct net_device *dev
5502                         = list_first_entry(&list, struct net_device, todo_list);
5503                 list_del(&dev->todo_list);
5504
5505                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5506                         printk(KERN_ERR "network todo '%s' but state %d\n",
5507                                dev->name, dev->reg_state);
5508                         dump_stack();
5509                         continue;
5510                 }
5511
5512                 dev->reg_state = NETREG_UNREGISTERED;
5513
5514                 on_each_cpu(flush_backlog, dev, 1);
5515
5516                 netdev_wait_allrefs(dev);
5517
5518                 /* paranoia */
5519                 BUG_ON(netdev_refcnt_read(dev));
5520                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5521                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5522                 WARN_ON(dev->dn_ptr);
5523
5524                 if (dev->destructor)
5525                         dev->destructor(dev);
5526
5527                 /* Free network device */
5528                 kobject_put(&dev->dev.kobj);
5529         }
5530 }
5531
5532 /**
5533  *      dev_txq_stats_fold - fold tx_queues stats
5534  *      @dev: device to get statistics from
5535  *      @stats: struct rtnl_link_stats64 to hold results
5536  */
5537 void dev_txq_stats_fold(const struct net_device *dev,
5538                         struct rtnl_link_stats64 *stats)
5539 {
5540         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5541         unsigned int i;
5542         struct netdev_queue *txq;
5543
5544         for (i = 0; i < dev->num_tx_queues; i++) {
5545                 txq = netdev_get_tx_queue(dev, i);
5546                 spin_lock_bh(&txq->_xmit_lock);
5547                 tx_bytes   += txq->tx_bytes;
5548                 tx_packets += txq->tx_packets;
5549                 tx_dropped += txq->tx_dropped;
5550                 spin_unlock_bh(&txq->_xmit_lock);
5551         }
5552         if (tx_bytes || tx_packets || tx_dropped) {
5553                 stats->tx_bytes   = tx_bytes;
5554                 stats->tx_packets = tx_packets;
5555                 stats->tx_dropped = tx_dropped;
5556         }
5557 }
5558 EXPORT_SYMBOL(dev_txq_stats_fold);
5559
5560 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5561  * fields in the same order, with only the type differing.
5562  */
5563 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5564                                     const struct net_device_stats *netdev_stats)
5565 {
5566 #if BITS_PER_LONG == 64
5567         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5568         memcpy(stats64, netdev_stats, sizeof(*stats64));
5569 #else
5570         size_t i, n = sizeof(*stats64) / sizeof(u64);
5571         const unsigned long *src = (const unsigned long *)netdev_stats;
5572         u64 *dst = (u64 *)stats64;
5573
5574         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5575                      sizeof(*stats64) / sizeof(u64));
5576         for (i = 0; i < n; i++)
5577                 dst[i] = src[i];
5578 #endif
5579 }
5580
5581 /**
5582  *      dev_get_stats   - get network device statistics
5583  *      @dev: device to get statistics from
5584  *      @storage: place to store stats
5585  *
5586  *      Get network statistics from device. Return @storage.
5587  *      The device driver may provide its own method by setting
5588  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5589  *      otherwise the internal statistics structure is used.
5590  */
5591 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5592                                         struct rtnl_link_stats64 *storage)
5593 {
5594         const struct net_device_ops *ops = dev->netdev_ops;
5595
5596         if (ops->ndo_get_stats64) {
5597                 memset(storage, 0, sizeof(*storage));
5598                 ops->ndo_get_stats64(dev, storage);
5599         } else if (ops->ndo_get_stats) {
5600                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5601         } else {
5602                 netdev_stats_to_stats64(storage, &dev->stats);
5603                 dev_txq_stats_fold(dev, storage);
5604         }
5605         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5606         return storage;
5607 }
5608 EXPORT_SYMBOL(dev_get_stats);
5609
5610 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5611 {
5612         struct netdev_queue *queue = dev_ingress_queue(dev);
5613
5614 #ifdef CONFIG_NET_CLS_ACT
5615         if (queue)
5616                 return queue;
5617         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5618         if (!queue)
5619                 return NULL;
5620         netdev_init_one_queue(dev, queue, NULL);
5621         queue->qdisc = &noop_qdisc;
5622         queue->qdisc_sleeping = &noop_qdisc;
5623         rcu_assign_pointer(dev->ingress_queue, queue);
5624 #endif
5625         return queue;
5626 }
5627
5628 /**
5629  *      alloc_netdev_mq - allocate network device
5630  *      @sizeof_priv:   size of private data to allocate space for
5631  *      @name:          device name format string
5632  *      @setup:         callback to initialize device
5633  *      @queue_count:   the number of subqueues to allocate
5634  *
5635  *      Allocates a struct net_device with private data area for driver use
5636  *      and performs basic initialization.  Also allocates subquue structs
5637  *      for each queue on the device at the end of the netdevice.
5638  */
5639 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5640                 void (*setup)(struct net_device *), unsigned int queue_count)
5641 {
5642         struct net_device *dev;
5643         size_t alloc_size;
5644         struct net_device *p;
5645
5646         BUG_ON(strlen(name) >= sizeof(dev->name));
5647
5648         if (queue_count < 1) {
5649                 pr_err("alloc_netdev: Unable to allocate device "
5650                        "with zero queues.\n");
5651                 return NULL;
5652         }
5653
5654         alloc_size = sizeof(struct net_device);
5655         if (sizeof_priv) {
5656                 /* ensure 32-byte alignment of private area */
5657                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5658                 alloc_size += sizeof_priv;
5659         }
5660         /* ensure 32-byte alignment of whole construct */
5661         alloc_size += NETDEV_ALIGN - 1;
5662
5663         p = kzalloc(alloc_size, GFP_KERNEL);
5664         if (!p) {
5665                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5666                 return NULL;
5667         }
5668
5669         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5670         dev->padded = (char *)dev - (char *)p;
5671
5672         dev->pcpu_refcnt = alloc_percpu(int);
5673         if (!dev->pcpu_refcnt)
5674                 goto free_p;
5675
5676         if (dev_addr_init(dev))
5677                 goto free_pcpu;
5678
5679         dev_mc_init(dev);
5680         dev_uc_init(dev);
5681
5682         dev_net_set(dev, &init_net);
5683
5684         dev->num_tx_queues = queue_count;
5685         dev->real_num_tx_queues = queue_count;
5686         if (netif_alloc_netdev_queues(dev))
5687                 goto free_pcpu;
5688
5689 #ifdef CONFIG_RPS
5690         dev->num_rx_queues = queue_count;
5691         dev->real_num_rx_queues = queue_count;
5692         if (netif_alloc_rx_queues(dev))
5693                 goto free_pcpu;
5694 #endif
5695
5696         dev->gso_max_size = GSO_MAX_SIZE;
5697
5698         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5699         dev->ethtool_ntuple_list.count = 0;
5700         INIT_LIST_HEAD(&dev->napi_list);
5701         INIT_LIST_HEAD(&dev->unreg_list);
5702         INIT_LIST_HEAD(&dev->link_watch_list);
5703         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5704         setup(dev);
5705         strcpy(dev->name, name);
5706         return dev;
5707
5708 free_pcpu:
5709         free_percpu(dev->pcpu_refcnt);
5710         kfree(dev->_tx);
5711 #ifdef CONFIG_RPS
5712         kfree(dev->_rx);
5713 #endif
5714
5715 free_p:
5716         kfree(p);
5717         return NULL;
5718 }
5719 EXPORT_SYMBOL(alloc_netdev_mq);
5720
5721 /**
5722  *      free_netdev - free network device
5723  *      @dev: device
5724  *
5725  *      This function does the last stage of destroying an allocated device
5726  *      interface. The reference to the device object is released.
5727  *      If this is the last reference then it will be freed.
5728  */
5729 void free_netdev(struct net_device *dev)
5730 {
5731         struct napi_struct *p, *n;
5732
5733         release_net(dev_net(dev));
5734
5735         kfree(dev->_tx);
5736 #ifdef CONFIG_RPS
5737         kfree(dev->_rx);
5738 #endif
5739
5740         kfree(rcu_dereference_raw(dev->ingress_queue));
5741
5742         /* Flush device addresses */
5743         dev_addr_flush(dev);
5744
5745         /* Clear ethtool n-tuple list */
5746         ethtool_ntuple_flush(dev);
5747
5748         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5749                 netif_napi_del(p);
5750
5751         free_percpu(dev->pcpu_refcnt);
5752         dev->pcpu_refcnt = NULL;
5753
5754         /*  Compatibility with error handling in drivers */
5755         if (dev->reg_state == NETREG_UNINITIALIZED) {
5756                 kfree((char *)dev - dev->padded);
5757                 return;
5758         }
5759
5760         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5761         dev->reg_state = NETREG_RELEASED;
5762
5763         /* will free via device release */
5764         put_device(&dev->dev);
5765 }
5766 EXPORT_SYMBOL(free_netdev);
5767
5768 /**
5769  *      synchronize_net -  Synchronize with packet receive processing
5770  *
5771  *      Wait for packets currently being received to be done.
5772  *      Does not block later packets from starting.
5773  */
5774 void synchronize_net(void)
5775 {
5776         might_sleep();
5777         synchronize_rcu();
5778 }
5779 EXPORT_SYMBOL(synchronize_net);
5780
5781 /**
5782  *      unregister_netdevice_queue - remove device from the kernel
5783  *      @dev: device
5784  *      @head: list
5785  *
5786  *      This function shuts down a device interface and removes it
5787  *      from the kernel tables.
5788  *      If head not NULL, device is queued to be unregistered later.
5789  *
5790  *      Callers must hold the rtnl semaphore.  You may want
5791  *      unregister_netdev() instead of this.
5792  */
5793
5794 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5795 {
5796         ASSERT_RTNL();
5797
5798         if (head) {
5799                 list_move_tail(&dev->unreg_list, head);
5800         } else {
5801                 rollback_registered(dev);
5802                 /* Finish processing unregister after unlock */
5803                 net_set_todo(dev);
5804         }
5805 }
5806 EXPORT_SYMBOL(unregister_netdevice_queue);
5807
5808 /**
5809  *      unregister_netdevice_many - unregister many devices
5810  *      @head: list of devices
5811  */
5812 void unregister_netdevice_many(struct list_head *head)
5813 {
5814         struct net_device *dev;
5815
5816         if (!list_empty(head)) {
5817                 rollback_registered_many(head);
5818                 list_for_each_entry(dev, head, unreg_list)
5819                         net_set_todo(dev);
5820         }
5821 }
5822 EXPORT_SYMBOL(unregister_netdevice_many);
5823
5824 /**
5825  *      unregister_netdev - remove device from the kernel
5826  *      @dev: device
5827  *
5828  *      This function shuts down a device interface and removes it
5829  *      from the kernel tables.
5830  *
5831  *      This is just a wrapper for unregister_netdevice that takes
5832  *      the rtnl semaphore.  In general you want to use this and not
5833  *      unregister_netdevice.
5834  */
5835 void unregister_netdev(struct net_device *dev)
5836 {
5837         rtnl_lock();
5838         unregister_netdevice(dev);
5839         rtnl_unlock();
5840 }
5841 EXPORT_SYMBOL(unregister_netdev);
5842
5843 /**
5844  *      dev_change_net_namespace - move device to different nethost namespace
5845  *      @dev: device
5846  *      @net: network namespace
5847  *      @pat: If not NULL name pattern to try if the current device name
5848  *            is already taken in the destination network namespace.
5849  *
5850  *      This function shuts down a device interface and moves it
5851  *      to a new network namespace. On success 0 is returned, on
5852  *      a failure a netagive errno code is returned.
5853  *
5854  *      Callers must hold the rtnl semaphore.
5855  */
5856
5857 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5858 {
5859         int err;
5860
5861         ASSERT_RTNL();
5862
5863         /* Don't allow namespace local devices to be moved. */
5864         err = -EINVAL;
5865         if (dev->features & NETIF_F_NETNS_LOCAL)
5866                 goto out;
5867
5868         /* Ensure the device has been registrered */
5869         err = -EINVAL;
5870         if (dev->reg_state != NETREG_REGISTERED)
5871                 goto out;
5872
5873         /* Get out if there is nothing todo */
5874         err = 0;
5875         if (net_eq(dev_net(dev), net))
5876                 goto out;
5877
5878         /* Pick the destination device name, and ensure
5879          * we can use it in the destination network namespace.
5880          */
5881         err = -EEXIST;
5882         if (__dev_get_by_name(net, dev->name)) {
5883                 /* We get here if we can't use the current device name */
5884                 if (!pat)
5885                         goto out;
5886                 if (dev_get_valid_name(dev, pat, 1))
5887                         goto out;
5888         }
5889
5890         /*
5891          * And now a mini version of register_netdevice unregister_netdevice.
5892          */
5893
5894         /* If device is running close it first. */
5895         dev_close(dev);
5896
5897         /* And unlink it from device chain */
5898         err = -ENODEV;
5899         unlist_netdevice(dev);
5900
5901         synchronize_net();
5902
5903         /* Shutdown queueing discipline. */
5904         dev_shutdown(dev);
5905
5906         /* Notify protocols, that we are about to destroy
5907            this device. They should clean all the things.
5908
5909            Note that dev->reg_state stays at NETREG_REGISTERED.
5910            This is wanted because this way 8021q and macvlan know
5911            the device is just moving and can keep their slaves up.
5912         */
5913         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5914         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5915
5916         /*
5917          *      Flush the unicast and multicast chains
5918          */
5919         dev_uc_flush(dev);
5920         dev_mc_flush(dev);
5921
5922         /* Actually switch the network namespace */
5923         dev_net_set(dev, net);
5924
5925         /* If there is an ifindex conflict assign a new one */
5926         if (__dev_get_by_index(net, dev->ifindex)) {
5927                 int iflink = (dev->iflink == dev->ifindex);
5928                 dev->ifindex = dev_new_index(net);
5929                 if (iflink)
5930                         dev->iflink = dev->ifindex;
5931         }
5932
5933         /* Fixup kobjects */
5934         err = device_rename(&dev->dev, dev->name);
5935         WARN_ON(err);
5936
5937         /* Add the device back in the hashes */
5938         list_netdevice(dev);
5939
5940         /* Notify protocols, that a new device appeared. */
5941         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5942
5943         /*
5944          *      Prevent userspace races by waiting until the network
5945          *      device is fully setup before sending notifications.
5946          */
5947         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5948
5949         synchronize_net();
5950         err = 0;
5951 out:
5952         return err;
5953 }
5954 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5955
5956 static int dev_cpu_callback(struct notifier_block *nfb,
5957                             unsigned long action,
5958                             void *ocpu)
5959 {
5960         struct sk_buff **list_skb;
5961         struct sk_buff *skb;
5962         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5963         struct softnet_data *sd, *oldsd;
5964
5965         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5966                 return NOTIFY_OK;
5967
5968         local_irq_disable();
5969         cpu = smp_processor_id();
5970         sd = &per_cpu(softnet_data, cpu);
5971         oldsd = &per_cpu(softnet_data, oldcpu);
5972
5973         /* Find end of our completion_queue. */
5974         list_skb = &sd->completion_queue;
5975         while (*list_skb)
5976                 list_skb = &(*list_skb)->next;
5977         /* Append completion queue from offline CPU. */
5978         *list_skb = oldsd->completion_queue;
5979         oldsd->completion_queue = NULL;
5980
5981         /* Append output queue from offline CPU. */
5982         if (oldsd->output_queue) {
5983                 *sd->output_queue_tailp = oldsd->output_queue;
5984                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5985                 oldsd->output_queue = NULL;
5986                 oldsd->output_queue_tailp = &oldsd->output_queue;
5987         }
5988
5989         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5990         local_irq_enable();
5991
5992         /* Process offline CPU's input_pkt_queue */
5993         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5994                 netif_rx(skb);
5995                 input_queue_head_incr(oldsd);
5996         }
5997         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5998                 netif_rx(skb);
5999                 input_queue_head_incr(oldsd);
6000         }
6001
6002         return NOTIFY_OK;
6003 }
6004
6005
6006 /**
6007  *      netdev_increment_features - increment feature set by one
6008  *      @all: current feature set
6009  *      @one: new feature set
6010  *      @mask: mask feature set
6011  *
6012  *      Computes a new feature set after adding a device with feature set
6013  *      @one to the master device with current feature set @all.  Will not
6014  *      enable anything that is off in @mask. Returns the new feature set.
6015  */
6016 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6017                                         unsigned long mask)
6018 {
6019         /* If device needs checksumming, downgrade to it. */
6020         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6021                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6022         else if (mask & NETIF_F_ALL_CSUM) {
6023                 /* If one device supports v4/v6 checksumming, set for all. */
6024                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6025                     !(all & NETIF_F_GEN_CSUM)) {
6026                         all &= ~NETIF_F_ALL_CSUM;
6027                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6028                 }
6029
6030                 /* If one device supports hw checksumming, set for all. */
6031                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6032                         all &= ~NETIF_F_ALL_CSUM;
6033                         all |= NETIF_F_HW_CSUM;
6034                 }
6035         }
6036
6037         one |= NETIF_F_ALL_CSUM;
6038
6039         one |= all & NETIF_F_ONE_FOR_ALL;
6040         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6041         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6042
6043         return all;
6044 }
6045 EXPORT_SYMBOL(netdev_increment_features);
6046
6047 static struct hlist_head *netdev_create_hash(void)
6048 {
6049         int i;
6050         struct hlist_head *hash;
6051
6052         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6053         if (hash != NULL)
6054                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6055                         INIT_HLIST_HEAD(&hash[i]);
6056
6057         return hash;
6058 }
6059
6060 /* Initialize per network namespace state */
6061 static int __net_init netdev_init(struct net *net)
6062 {
6063         INIT_LIST_HEAD(&net->dev_base_head);
6064
6065         net->dev_name_head = netdev_create_hash();
6066         if (net->dev_name_head == NULL)
6067                 goto err_name;
6068
6069         net->dev_index_head = netdev_create_hash();
6070         if (net->dev_index_head == NULL)
6071                 goto err_idx;
6072
6073         return 0;
6074
6075 err_idx:
6076         kfree(net->dev_name_head);
6077 err_name:
6078         return -ENOMEM;
6079 }
6080
6081 /**
6082  *      netdev_drivername - network driver for the device
6083  *      @dev: network device
6084  *      @buffer: buffer for resulting name
6085  *      @len: size of buffer
6086  *
6087  *      Determine network driver for device.
6088  */
6089 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6090 {
6091         const struct device_driver *driver;
6092         const struct device *parent;
6093
6094         if (len <= 0 || !buffer)
6095                 return buffer;
6096         buffer[0] = 0;
6097
6098         parent = dev->dev.parent;
6099
6100         if (!parent)
6101                 return buffer;
6102
6103         driver = parent->driver;
6104         if (driver && driver->name)
6105                 strlcpy(buffer, driver->name, len);
6106         return buffer;
6107 }
6108
6109 static int __netdev_printk(const char *level, const struct net_device *dev,
6110                            struct va_format *vaf)
6111 {
6112         int r;
6113
6114         if (dev && dev->dev.parent)
6115                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6116                                netdev_name(dev), vaf);
6117         else if (dev)
6118                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6119         else
6120                 r = printk("%s(NULL net_device): %pV", level, vaf);
6121
6122         return r;
6123 }
6124
6125 int netdev_printk(const char *level, const struct net_device *dev,
6126                   const char *format, ...)
6127 {
6128         struct va_format vaf;
6129         va_list args;
6130         int r;
6131
6132         va_start(args, format);
6133
6134         vaf.fmt = format;
6135         vaf.va = &args;
6136
6137         r = __netdev_printk(level, dev, &vaf);
6138         va_end(args);
6139
6140         return r;
6141 }
6142 EXPORT_SYMBOL(netdev_printk);
6143
6144 #define define_netdev_printk_level(func, level)                 \
6145 int func(const struct net_device *dev, const char *fmt, ...)    \
6146 {                                                               \
6147         int r;                                                  \
6148         struct va_format vaf;                                   \
6149         va_list args;                                           \
6150                                                                 \
6151         va_start(args, fmt);                                    \
6152                                                                 \
6153         vaf.fmt = fmt;                                          \
6154         vaf.va = &args;                                         \
6155                                                                 \
6156         r = __netdev_printk(level, dev, &vaf);                  \
6157         va_end(args);                                           \
6158                                                                 \
6159         return r;                                               \
6160 }                                                               \
6161 EXPORT_SYMBOL(func);
6162
6163 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6164 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6165 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6166 define_netdev_printk_level(netdev_err, KERN_ERR);
6167 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6168 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6169 define_netdev_printk_level(netdev_info, KERN_INFO);
6170
6171 static void __net_exit netdev_exit(struct net *net)
6172 {
6173         kfree(net->dev_name_head);
6174         kfree(net->dev_index_head);
6175 }
6176
6177 static struct pernet_operations __net_initdata netdev_net_ops = {
6178         .init = netdev_init,
6179         .exit = netdev_exit,
6180 };
6181
6182 static void __net_exit default_device_exit(struct net *net)
6183 {
6184         struct net_device *dev, *aux;
6185         /*
6186          * Push all migratable network devices back to the
6187          * initial network namespace
6188          */
6189         rtnl_lock();
6190         for_each_netdev_safe(net, dev, aux) {
6191                 int err;
6192                 char fb_name[IFNAMSIZ];
6193
6194                 /* Ignore unmoveable devices (i.e. loopback) */
6195                 if (dev->features & NETIF_F_NETNS_LOCAL)
6196                         continue;
6197
6198                 /* Leave virtual devices for the generic cleanup */
6199                 if (dev->rtnl_link_ops)
6200                         continue;
6201
6202                 /* Push remaing network devices to init_net */
6203                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6204                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6205                 if (err) {
6206                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6207                                 __func__, dev->name, err);
6208                         BUG();
6209                 }
6210         }
6211         rtnl_unlock();
6212 }
6213
6214 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6215 {
6216         /* At exit all network devices most be removed from a network
6217          * namespace.  Do this in the reverse order of registeration.
6218          * Do this across as many network namespaces as possible to
6219          * improve batching efficiency.
6220          */
6221         struct net_device *dev;
6222         struct net *net;
6223         LIST_HEAD(dev_kill_list);
6224
6225         rtnl_lock();
6226         list_for_each_entry(net, net_list, exit_list) {
6227                 for_each_netdev_reverse(net, dev) {
6228                         if (dev->rtnl_link_ops)
6229                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6230                         else
6231                                 unregister_netdevice_queue(dev, &dev_kill_list);
6232                 }
6233         }
6234         unregister_netdevice_many(&dev_kill_list);
6235         rtnl_unlock();
6236 }
6237
6238 static struct pernet_operations __net_initdata default_device_ops = {
6239         .exit = default_device_exit,
6240         .exit_batch = default_device_exit_batch,
6241 };
6242
6243 /*
6244  *      Initialize the DEV module. At boot time this walks the device list and
6245  *      unhooks any devices that fail to initialise (normally hardware not
6246  *      present) and leaves us with a valid list of present and active devices.
6247  *
6248  */
6249
6250 /*
6251  *       This is called single threaded during boot, so no need
6252  *       to take the rtnl semaphore.
6253  */
6254 static int __init net_dev_init(void)
6255 {
6256         int i, rc = -ENOMEM;
6257
6258         BUG_ON(!dev_boot_phase);
6259
6260         if (dev_proc_init())
6261                 goto out;
6262
6263         if (netdev_kobject_init())
6264                 goto out;
6265
6266         INIT_LIST_HEAD(&ptype_all);
6267         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6268                 INIT_LIST_HEAD(&ptype_base[i]);
6269
6270         if (register_pernet_subsys(&netdev_net_ops))
6271                 goto out;
6272
6273         /*
6274          *      Initialise the packet receive queues.
6275          */
6276
6277         for_each_possible_cpu(i) {
6278                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6279
6280                 memset(sd, 0, sizeof(*sd));
6281                 skb_queue_head_init(&sd->input_pkt_queue);
6282                 skb_queue_head_init(&sd->process_queue);
6283                 sd->completion_queue = NULL;
6284                 INIT_LIST_HEAD(&sd->poll_list);
6285                 sd->output_queue = NULL;
6286                 sd->output_queue_tailp = &sd->output_queue;
6287 #ifdef CONFIG_RPS
6288                 sd->csd.func = rps_trigger_softirq;
6289                 sd->csd.info = sd;
6290                 sd->csd.flags = 0;
6291                 sd->cpu = i;
6292 #endif
6293
6294                 sd->backlog.poll = process_backlog;
6295                 sd->backlog.weight = weight_p;
6296                 sd->backlog.gro_list = NULL;
6297                 sd->backlog.gro_count = 0;
6298         }
6299
6300         dev_boot_phase = 0;
6301
6302         /* The loopback device is special if any other network devices
6303          * is present in a network namespace the loopback device must
6304          * be present. Since we now dynamically allocate and free the
6305          * loopback device ensure this invariant is maintained by
6306          * keeping the loopback device as the first device on the
6307          * list of network devices.  Ensuring the loopback devices
6308          * is the first device that appears and the last network device
6309          * that disappears.
6310          */
6311         if (register_pernet_device(&loopback_net_ops))
6312                 goto out;
6313
6314         if (register_pernet_device(&default_device_ops))
6315                 goto out;
6316
6317         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6318         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6319
6320         hotcpu_notifier(dev_cpu_callback, 0);
6321         dst_init();
6322         dev_mcast_init();
6323         rc = 0;
6324 out:
6325         return rc;
6326 }
6327
6328 subsys_initcall(net_dev_init);
6329
6330 static int __init initialize_hashrnd(void)
6331 {
6332         get_random_bytes(&hashrnd, sizeof(hashrnd));
6333         return 0;
6334 }
6335
6336 late_initcall_sync(initialize_hashrnd);
6337