net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
 172 #define PTYPE_HASH_SIZE (16)
 173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 174
 175 static DEFINE_SPINLOCK(ptype_lock);
 176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177 static struct list_head ptype_all __read_mostly;        /* Taps */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
 198 DEFINE_RWLOCK(dev_base_lock);
 199 EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
 257 /*
 258  *      Our notifier list
 259  */
 260
 261 static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263 /*
 264  *      Device drivers call our routines to queue packets here. We empty the
 265  *      queue in the local softnet handler.
 266  */
 267
 268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269 EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
 276 static const unsigned short netdev_lock_type[] =
 277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 290          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 291          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 292          ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 308          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 309          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 310          "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
/*
 * Run the netdevice notifier chain with @event for @dev and hand the
 * chain's result back to the caller unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        int chain_ret;

        chain_ret = call_netdevice_notifiers(event, dev);
        return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
1356 static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550         rcu_read_lock();
1551         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552                 /* Never send packets back to the socket
1553                  * they originated from - MvS (miquels@drinkel.ow.org)
1554                  */
1555                 if ((ptype->dev == dev || !ptype->dev) &&
1556                     (ptype->af_packet_priv == NULL ||
1557                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558                         if (pt_prev) {
1559                                 deliver_skb(skb2, pt_prev, skb->dev);
1560                                 pt_prev = ptype;
1561                                 continue;
1562                         }
1563
1564                         skb2 = skb_clone(skb, GFP_ATOMIC);
1565                         if (!skb2)
1566                                 break;
1567
1568                         net_timestamp_set(skb2);
1569
1570                         /* skb->nh should be correctly
1571                            set by sender, so that the second statement is
1572                            just protection against buggy protocols.
1573                          */
1574                         skb_reset_mac_header(skb2);
1575
1576                         if (skb_network_header(skb2) < skb2->data ||
1577                             skb2->network_header > skb2->tail) {
1578                                 if (net_ratelimit())
1579                                         printk(KERN_CRIT "protocol %04x is "
1580                                                "buggy, dev %s\n",
1581                                                ntohs(skb2->protocol),
1582                                                dev->name);
1583                                 skb_reset_network_header(skb2);
1584                         }
1585
1586                         skb2->transport_header = skb2->network_header;
1587                         skb2->pkt_type = PACKET_OUTGOING;
1588                         pt_prev = ptype;
1589                 }
1590         }
1591         if (pt_prev)
1592                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593         rcu_read_unlock();
1594 }
1595
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602         int rc;
1603
1604         if (txq < 1 || txq > dev->num_tx_queues)
1605                 return -EINVAL;
1606
1607         if (dev->reg_state == NETREG_REGISTERED) {
1608                 ASSERT_RTNL();
1609
1610                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611                                                   txq);
1612                 if (rc)
1613                         return rc;
1614
1615                 if (txq < dev->real_num_tx_queues)
1616                         qdisc_reset_all_tx_gt(dev, txq);
1617         }
1618
1619         dev->real_num_tx_queues = txq;
1620         return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623
#ifdef CONFIG_RPS
/**
 *      netif_set_real_num_rx_queues - set actual number of RX queues used
 *      @dev: Network device
 *      @rxq: Actual number of RX queues
 *
 *      This must be called either with the rtnl_lock held or before
 *      registration of the net device.  Returns 0 on success, or a
 *      negative error code: -EINVAL when @rxq is zero or exceeds
 *      dev->num_rx_queues, or the error from the kobject update.  If
 *      called before registration with a valid @rxq, it always succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        if (!rxq || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                int err;

                ASSERT_RTNL();

                err = net_rx_queue_update_kobjects(dev,
                                                   dev->real_num_rx_queues,
                                                   rxq);
                if (err)
                        return err;
        }

        dev->real_num_rx_queues = rxq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1656
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659         struct softnet_data *sd;
1660         unsigned long flags;
1661
1662         local_irq_save(flags);
1663         sd = &__get_cpu_var(softnet_data);
1664         q->next_sched = NULL;
1665         *sd->output_queue_tailp = q;
1666         sd->output_queue_tailp = &q->next_sched;
1667         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668         local_irq_restore(flags);
1669 }
1670
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674                 __netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680         if (atomic_dec_and_test(&skb->users)) {
1681                 struct softnet_data *sd;
1682                 unsigned long flags;
1683
1684                 local_irq_save(flags);
1685                 sd = &__get_cpu_var(softnet_data);
1686                 skb->next = sd->completion_queue;
1687                 sd->completion_queue = skb;
1688                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689                 local_irq_restore(flags);
1690         }
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1702
1703
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713             netif_running(dev)) {
1714                 netif_tx_stop_all_queues(dev);
1715         }
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728             netif_running(dev)) {
1729                 netif_tx_wake_all_queues(dev);
1730                 __netdev_watchdog_up(dev);
1731         }
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference of @skb is always dropped. If @skb is already owned
 * by a device that lives in a different network namespace than @dev, all
 * per-namespace state carried by the buffer (secpath, netfilter state,
 * secmark, mark, priority, nf_trace, ipvs_property and, with
 * CONFIG_NET_SCHED, tc_index) is reset before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *old_dev;

        skb_dst_drop(skb);

        old_dev = skb->dev;
        if (old_dev && !net_eq(dev_net(old_dev), dev_net(dev))) {
                /* crossing namespaces: scrub everything namespace-private */
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1764
1765 /*
1766  * Invalidate hardware checksum when packet is to be mangled, and
1767  * complete checksum manually on outgoing path.
1768  */
1769 int skb_checksum_help(struct sk_buff *skb)
1770 {
1771         __wsum csum;
1772         int ret = 0, offset;
1773
1774         if (skb->ip_summed == CHECKSUM_COMPLETE)
1775                 goto out_set_summed;
1776
1777         if (unlikely(skb_shinfo(skb)->gso_size)) {
1778                 /* Let GSO fix up the checksum. */
1779                 goto out_set_summed;
1780         }
1781
1782         offset = skb_checksum_start_offset(skb);
1783         BUG_ON(offset >= skb_headlen(skb));
1784         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1785
1786         offset += skb->csum_offset;
1787         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1788
1789         if (skb_cloned(skb) &&
1790             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1791                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1792                 if (ret)
1793                         goto out;
1794         }
1795
1796         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1797 out_set_summed:
1798         skb->ip_summed = CHECKSUM_NONE;
1799 out:
1800         return ret;
1801 }
1802 EXPORT_SYMBOL(skb_checksum_help);
1803
1804 /**
1805  *      skb_gso_segment - Perform segmentation on skb.
1806  *      @skb: buffer to segment
1807  *      @features: features for the output path (see dev->features)
1808  *
1809  *      This function segments the given skb and returns a list of segments.
1810  *
1811  *      It may return NULL if the skb requires no segmentation.  This is
1812  *      only possible when GSO is used for verifying header integrity.
1813  */
1814 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1815 {
1816         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1817         struct packet_type *ptype;
1818         __be16 type = skb->protocol;
1819         int vlan_depth = ETH_HLEN;
1820         int err;
1821
1822         while (type == htons(ETH_P_8021Q)) {
1823                 struct vlan_hdr *vh;
1824
1825                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1826                         return ERR_PTR(-EINVAL);
1827
1828                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1829                 type = vh->h_vlan_encapsulated_proto;
1830                 vlan_depth += VLAN_HLEN;
1831         }
1832
1833         skb_reset_mac_header(skb);
1834         skb->mac_len = skb->network_header - skb->mac_header;
1835         __skb_pull(skb, skb->mac_len);
1836
1837         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1838                 struct net_device *dev = skb->dev;
1839                 struct ethtool_drvinfo info = {};
1840
1841                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1842                         dev->ethtool_ops->get_drvinfo(dev, &info);
1843
1844                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1845                      info.driver, dev ? dev->features : 0L,
1846                      skb->sk ? skb->sk->sk_route_caps : 0L,
1847                      skb->len, skb->data_len, skb->ip_summed);
1848
1849                 if (skb_header_cloned(skb) &&
1850                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1851                         return ERR_PTR(err);
1852         }
1853
1854         rcu_read_lock();
1855         list_for_each_entry_rcu(ptype,
1856                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1857                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1858                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1859                                 err = ptype->gso_send_check(skb);
1860                                 segs = ERR_PTR(err);
1861                                 if (err || skb_gso_ok(skb, features))
1862                                         break;
1863                                 __skb_push(skb, (skb->data -
1864                                                  skb_network_header(skb)));
1865                         }
1866                         segs = ptype->gso_segment(skb, features);
1867                         break;
1868                 }
1869         }
1870         rcu_read_unlock();
1871
1872         __skb_push(skb, skb->data - skb_mac_header(skb));
1873
1874         return segs;
1875 }
1876 EXPORT_SYMBOL(skb_gso_segment);
1877
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev is
 * NULL) and dump the current stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *name;

        if (!net_ratelimit())
                return;

        name = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", name);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1890
/*
 * Report whether @skb has page fragments that @dev cannot DMA from.
 *
 * Returns 1 when, with CONFIG_HIGHMEM enabled, either
 *  - the device lacks NETIF_F_HIGHDMA and some fragment is in high memory, or
 *  - PCI_DMA_BUS_IS_PHYS is true, the device has a parent, and some
 *    fragment ends beyond the parent's dma_mask (or the parent has no mask).
 * Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int frag;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                parent = dev->dev.parent;
                if (!parent)
                        return 0;

                for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                        dma_addr_t addr =
                                page_to_phys(skb_shinfo(skb)->frags[frag].page);

                        if (!parent->dma_mask)
                                return 1;
                        if (addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
1920
/*
 * Private data kept in skb->cb while a list of GSO segments hangs off
 * skb->next (see dev_gso_segment()).
 */
struct dev_gso_cb {
        /* skb->destructor as it was before dev_gso_segment() replaced it */
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1926
1927 static void dev_gso_skb_destructor(struct sk_buff *skb)
1928 {
1929         struct dev_gso_cb *cb;
1930
1931         do {
1932                 struct sk_buff *nskb = skb->next;
1933
1934                 skb->next = nskb->next;
1935                 nskb->next = NULL;
1936                 kfree_skb(nskb);
1937         } while (skb->next);
1938
1939         cb = DEV_GSO_CB(skb);
1940         if (cb->destructor)
1941                 cb->destructor(skb);
1942 }
1943
1944 /**
1945  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1946  *      @skb: buffer to segment
1947  *      @features: device features as applicable to this skb
1948  *
1949  *      This function segments the given skb and stores the list of segments
1950  *      in skb->next.
1951  */
1952 static int dev_gso_segment(struct sk_buff *skb, int features)
1953 {
1954         struct sk_buff *segs;
1955
1956         segs = skb_gso_segment(skb, features);
1957
1958         /* Verifying header integrity only. */
1959         if (!segs)
1960                 return 0;
1961
1962         if (IS_ERR(segs))
1963                 return PTR_ERR(segs);
1964
1965         skb->next = segs;
1966         DEV_GSO_CB(skb)->destructor = skb->destructor;
1967         skb->destructor = dev_gso_skb_destructor;
1968
1969         return 0;
1970 }
1971
1972 /*
1973  * Try to orphan skb early, right before transmission by the device.
1974  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1975  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1976  */
1977 static inline void skb_orphan_try(struct sk_buff *skb)
1978 {
1979         struct sock *sk = skb->sk;
1980
1981         if (sk && !skb_shinfo(skb)->tx_flags) {
1982                 /* skb_tx_hash() wont be able to get sk.
1983                  * We copy sk_hash into skb->rxhash
1984                  */
1985                 if (!skb->rxhash)
1986                         skb->rxhash = sk->sk_hash;
1987                 skb_orphan(skb);
1988         }
1989 }
1990
1991 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1992 {
1993         return ((features & NETIF_F_GEN_CSUM) ||
1994                 ((features & NETIF_F_V4_CSUM) &&
1995                  protocol == htons(ETH_P_IP)) ||
1996                 ((features & NETIF_F_V6_CSUM) &&
1997                  protocol == htons(ETH_P_IPV6)) ||
1998                 ((features & NETIF_F_FCOE_CRC) &&
1999                  protocol == htons(ETH_P_FCOE)));
2000 }
2001
2002 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2003 {
2004         if (!can_checksum_protocol(protocol, features)) {
2005                 features &= ~NETIF_F_ALL_CSUM;
2006                 features &= ~NETIF_F_SG;
2007         } else if (illegal_highdma(skb->dev, skb)) {
2008                 features &= ~NETIF_F_SG;
2009         }
2010
2011         return features;
2012 }
2013
2014 int netif_skb_features(struct sk_buff *skb)
2015 {
2016         __be16 protocol = skb->protocol;
2017         int features = skb->dev->features;
2018
2019         if (protocol == htons(ETH_P_8021Q)) {
2020                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2021                 protocol = veh->h_vlan_encapsulated_proto;
2022         } else if (!vlan_tx_tag_present(skb)) {
2023                 return harmonize_features(skb, protocol, features);
2024         }
2025
2026         features &= skb->dev->vlan_features;
2027
2028         if (protocol != htons(ETH_P_8021Q)) {
2029                 return harmonize_features(skb, protocol, features);
2030         } else {
2031                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2032                                 NETIF_F_GEN_CSUM;
2033                 return harmonize_features(skb, protocol, features);
2034         }
2035 }
2036 EXPORT_SYMBOL(netif_skb_features);
2037
2038 /*
2039  * Returns true if either:
2040  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2041  *      2. skb is fragmented and the device does not support SG, or if
2042  *         at least one of fragments is in highmem and device does not
2043  *         support DMA from it.
2044  */
2045 static inline int skb_needs_linearize(struct sk_buff *skb,
2046                                       int features)
2047 {
2048         return skb_is_nonlinear(skb) &&
2049                         ((skb_has_frag_list(skb) &&
2050                                 !(features & NETIF_F_FRAGLIST)) ||
2051                         (skb_shinfo(skb)->nr_frags &&
2052                                 !(features & NETIF_F_SG)));
2053 }
2054
2055 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2056                         struct netdev_queue *txq)
2057 {
2058         const struct net_device_ops *ops = dev->netdev_ops;
2059         int rc = NETDEV_TX_OK;
2060
2061         if (likely(!skb->next)) {
2062                 int features;
2063
2064                 /*
2065                  * If device doesnt need skb->dst, release it right now while
2066                  * its hot in this cpu cache
2067                  */
2068                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2069                         skb_dst_drop(skb);
2070
2071                 if (!list_empty(&ptype_all))
2072                         dev_queue_xmit_nit(skb, dev);
2073
2074                 skb_orphan_try(skb);
2075
2076                 features = netif_skb_features(skb);
2077
2078                 if (vlan_tx_tag_present(skb) &&
2079                     !(features & NETIF_F_HW_VLAN_TX)) {
2080                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2081                         if (unlikely(!skb))
2082                                 goto out;
2083
2084                         skb->vlan_tci = 0;
2085                 }
2086
2087                 if (netif_needs_gso(skb, features)) {
2088                         if (unlikely(dev_gso_segment(skb, features)))
2089                                 goto out_kfree_skb;
2090                         if (skb->next)
2091                                 goto gso;
2092                 } else {
2093                         if (skb_needs_linearize(skb, features) &&
2094                             __skb_linearize(skb))
2095                                 goto out_kfree_skb;
2096
2097                         /* If packet is not checksummed and device does not
2098                          * support checksumming for this protocol, complete
2099                          * checksumming here.
2100                          */
2101                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2102                                 skb_set_transport_header(skb,
2103                                         skb_checksum_start_offset(skb));
2104                                 if (!(features & NETIF_F_ALL_CSUM) &&
2105                                      skb_checksum_help(skb))
2106                                         goto out_kfree_skb;
2107                         }
2108                 }
2109
2110                 rc = ops->ndo_start_xmit(skb, dev);
2111                 trace_net_dev_xmit(skb, rc);
2112                 if (rc == NETDEV_TX_OK)
2113                         txq_trans_update(txq);
2114                 return rc;
2115         }
2116
2117 gso:
2118         do {
2119                 struct sk_buff *nskb = skb->next;
2120
2121                 skb->next = nskb->next;
2122                 nskb->next = NULL;
2123
2124                 /*
2125                  * If device doesnt need nskb->dst, release it right now while
2126                  * its hot in this cpu cache
2127                  */
2128                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2129                         skb_dst_drop(nskb);
2130
2131                 rc = ops->ndo_start_xmit(nskb, dev);
2132                 trace_net_dev_xmit(nskb, rc);
2133                 if (unlikely(rc != NETDEV_TX_OK)) {
2134                         if (rc & ~NETDEV_TX_MASK)
2135                                 goto out_kfree_gso_skb;
2136                         nskb->next = skb->next;
2137                         skb->next = nskb;
2138                         return rc;
2139                 }
2140                 txq_trans_update(txq);
2141                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2142                         return NETDEV_TX_BUSY;
2143         } while (skb->next);
2144
2145 out_kfree_gso_skb:
2146         if (likely(skb->next == NULL))
2147                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2148 out_kfree_skb:
2149         kfree_skb(skb);
2150 out:
2151         return rc;
2152 }
2153
2154 static u32 hashrnd __read_mostly;
2155
2156 /*
2157  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2158  * to be used as a distribution range.
2159  */
2160 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2161                   unsigned int num_tx_queues)
2162 {
2163         u32 hash;
2164
2165         if (skb_rx_queue_recorded(skb)) {
2166                 hash = skb_get_rx_queue(skb);
2167                 while (unlikely(hash >= num_tx_queues))
2168                         hash -= num_tx_queues;
2169                 return hash;
2170         }
2171
2172         if (skb->sk && skb->sk->sk_hash)
2173                 hash = skb->sk->sk_hash;
2174         else
2175                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2176         hash = jhash_1word(hash, hashrnd);
2177
2178         return (u16) (((u64) hash * num_tx_queues) >> 32);
2179 }
2180 EXPORT_SYMBOL(__skb_tx_hash);
2181
2182 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2183 {
2184         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2185                 if (net_ratelimit()) {
2186                         pr_warning("%s selects TX queue %d, but "
2187                                 "real number of TX queues is %d\n",
2188                                 dev->name, queue_index, dev->real_num_tx_queues);
2189                 }
2190                 return 0;
2191         }
2192         return queue_index;
2193 }
2194
/*
 * Look up a TX queue for @skb in the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when CONFIG_XPS is off, the device has no
 * XPS maps, the current CPU has no map, or the chosen queue is not below
 * dev->real_num_tx_queues. With a multi-queue map the entry is picked by
 * the same hash that __skb_tx_hash() uses.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *dev_maps;
        struct xps_map *map;
        int queue_index = -1;

        rcu_read_lock();

        dev_maps = rcu_dereference(dev->xps_maps);
        if (!dev_maps)
                goto unlock;

        map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
        if (!map)
                goto unlock;

        if (map->len == 1) {
                queue_index = map->queues[0];
        } else {
                u32 hash;

                if (skb->sk && skb->sk->sk_hash)
                        hash = skb->sk->sk_hash;
                else
                        hash = (__force u16) skb->protocol ^ skb->rxhash;
                hash = jhash_1word(hash, hashrnd);

                queue_index = map->queues[((u64)hash * map->len) >> 32];
        }

        if (unlikely(queue_index >= dev->real_num_tx_queues))
                queue_index = -1;

unlock:
        rcu_read_unlock();

        return queue_index;
#else
        return -1;
#endif
}
2232
2233 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2234                                         struct sk_buff *skb)
2235 {
2236         int queue_index;
2237         const struct net_device_ops *ops = dev->netdev_ops;
2238
2239         if (dev->real_num_tx_queues == 1)
2240                 queue_index = 0;
2241         else if (ops->ndo_select_queue) {
2242                 queue_index = ops->ndo_select_queue(dev, skb);
2243                 queue_index = dev_cap_txqueue(dev, queue_index);
2244         } else {
2245                 struct sock *sk = skb->sk;
2246                 queue_index = sk_tx_queue_get(sk);
2247
2248                 if (queue_index < 0 || skb->ooo_okay ||
2249                     queue_index >= dev->real_num_tx_queues) {
2250                         int old_index = queue_index;
2251
2252                         queue_index = get_xps_queue(dev, skb);
2253                         if (queue_index < 0)
2254                                 queue_index = skb_tx_hash(dev, skb);
2255
2256                         if (queue_index != old_index && sk) {
2257                                 struct dst_entry *dst =
2258                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2259
2260                                 if (dst && skb_dst(skb) == dst)
2261                                         sk_tx_queue_set(sk, queue_index);
2262                         }
2263                 }
2264         }
2265
2266         skb_set_queue_mapping(skb, queue_index);
2267         return netdev_get_tx_queue(dev, queue_index);
2268 }
2269
2270 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2271                                  struct net_device *dev,
2272                                  struct netdev_queue *txq)
2273 {
2274         spinlock_t *root_lock = qdisc_lock(q);
2275         bool contended = qdisc_is_running(q);
2276         int rc;
2277
2278         /*
2279          * Heuristic to force contended enqueues to serialize on a
2280          * separate lock before trying to get qdisc main lock.
2281          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2282          * and dequeue packets faster.
2283          */
2284         if (unlikely(contended))
2285                 spin_lock(&q->busylock);
2286
2287         spin_lock(root_lock);
2288         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2289                 kfree_skb(skb);
2290                 rc = NET_XMIT_DROP;
2291         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2292                    qdisc_run_begin(q)) {
2293                 /*
2294                  * This is a work-conserving queue; there are no old skbs
2295                  * waiting to be sent out; and the qdisc is not running -
2296                  * xmit the skb directly.
2297                  */
2298                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2299                         skb_dst_force(skb);
2300
2301                 qdisc_skb_cb(skb)->pkt_len = skb->len;
2302                 qdisc_bstats_update(q, skb);
2303
2304                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2305                         if (unlikely(contended)) {
2306                                 spin_unlock(&q->busylock);
2307                                 contended = false;
2308                         }
2309                         __qdisc_run(q);
2310                 } else
2311                         qdisc_run_end(q);
2312
2313                 rc = NET_XMIT_SUCCESS;
2314         } else {
2315                 skb_dst_force(skb);
2316                 rc = qdisc_enqueue_root(skb, q);
2317                 if (qdisc_run_begin(q)) {
2318                         if (unlikely(contended)) {
2319                                 spin_unlock(&q->busylock);
2320                                 contended = false;
2321                         }
2322                         __qdisc_run(q);
2323                 }
2324         }
2325         spin_unlock(root_lock);
2326         if (unlikely(contended))
2327                 spin_unlock(&q->busylock);
2328         return rc;
2329 }
2330
2331 static DEFINE_PER_CPU(int, xmit_recursion);
2332 #define RECURSION_LIMIT 10
2333
2334 /**
2335  *      dev_queue_xmit - transmit a buffer
2336  *      @skb: buffer to transmit
2337  *
2338  *      Queue a buffer for transmission to a network device. The caller must
2339  *      have set the device and priority and built the buffer before calling
2340  *      this function. The function can be called from an interrupt.
2341  *
2342  *      A negative errno code is returned on a failure. A success does not
2343  *      guarantee the frame will be transmitted as it may be dropped due
2344  *      to congestion or traffic shaping.
2345  *
2346  * -----------------------------------------------------------------------------------
2347  *      I notice this method can also return errors from the queue disciplines,
2348  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2349  *      be positive.
2350  *
2351  *      Regardless of the return value, the skb is consumed, so it is currently
2352  *      difficult to retry a send to this method.  (You can bump the ref count
2353  *      before sending to hold a reference for retry if you are careful.)
2354  *
2355  *      When calling this method, interrupts MUST be enabled.  This is because
2356  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2357  *          --BLG
2358  */
2359 int dev_queue_xmit(struct sk_buff *skb)
2360 {
2361         struct net_device *dev = skb->dev;
2362         struct netdev_queue *txq;
2363         struct Qdisc *q;
2364         int rc = -ENOMEM;
2365
2366         /* Disable soft irqs for various locks below. Also
2367          * stops preemption for RCU.
2368          */
2369         rcu_read_lock_bh();
2370
2371         txq = dev_pick_tx(dev, skb);
2372         q = rcu_dereference_bh(txq->qdisc);
2373
2374 #ifdef CONFIG_NET_CLS_ACT
2375         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2376 #endif
2377         trace_net_dev_queue(skb);
2378         if (q->enqueue) {
2379                 rc = __dev_xmit_skb(skb, q, dev, txq);
2380                 goto out;
2381         }
2382
2383         /* The device has no queue. Common case for software devices:
2384            loopback, all the sorts of tunnels...
2385
2386            Really, it is unlikely that netif_tx_lock protection is necessary
2387            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2388            counters.)
2389            However, it is possible, that they rely on protection
2390            made by us here.
2391
2392            Check this and shot the lock. It is not prone from deadlocks.
2393            Either shot noqueue qdisc, it is even simpler 8)
2394          */
2395         if (dev->flags & IFF_UP) {
2396                 int cpu = smp_processor_id(); /* ok because BHs are off */
2397
2398                 if (txq->xmit_lock_owner != cpu) {
2399
2400                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2401                                 goto recursion_alert;
2402
2403                         HARD_TX_LOCK(dev, txq, cpu);
2404
2405                         if (!netif_tx_queue_stopped(txq)) {
2406                                 __this_cpu_inc(xmit_recursion);
2407                                 rc = dev_hard_start_xmit(skb, dev, txq);
2408                                 __this_cpu_dec(xmit_recursion);
2409                                 if (dev_xmit_complete(rc)) {
2410                                         HARD_TX_UNLOCK(dev, txq);
2411                                         goto out;
2412                                 }
2413                         }
2414                         HARD_TX_UNLOCK(dev, txq);
2415                         if (net_ratelimit())
2416                                 printk(KERN_CRIT "Virtual device %s asks to "
2417                                        "queue packet!\n", dev->name);
2418                 } else {
2419                         /* Recursion is detected! It is possible,
2420                          * unfortunately
2421                          */
2422 recursion_alert:
2423                         if (net_ratelimit())
2424                                 printk(KERN_CRIT "Dead loop on virtual device "
2425                                        "%s, fix it urgently!\n", dev->name);
2426                 }
2427         }
2428
2429         rc = -ENETDOWN;
2430         rcu_read_unlock_bh();
2431
2432         kfree_skb(skb);
2433         return rc;
2434 out:
2435         rcu_read_unlock_bh();
2436         return rc;
2437 }
2438 EXPORT_SYMBOL(dev_queue_xmit);
2439
2440
2441 /*=======================================================================
2442                         Receiver routines
2443   =======================================================================*/
2444
2445 int netdev_max_backlog __read_mostly = 1000;
2446 int netdev_tstamp_prequeue __read_mostly = 1;
2447 int netdev_budget __read_mostly = 300;
2448 int weight_p __read_mostly = 64;            /* old backlog weight */
2449
2450 /* Called with irq disabled */
2451 static inline void ____napi_schedule(struct softnet_data *sd,
2452                                      struct napi_struct *napi)
2453 {
2454         list_add_tail(&napi->poll_list, &sd->poll_list);
2455         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2456 }
2457
2458 /*
2459  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2460  * and src/dst port numbers. Returns a non-zero hash number on success
2461  * and 0 on failure.
2462  */
2463 __u32 __skb_get_rxhash(struct sk_buff *skb)
2464 {
2465         int nhoff, hash = 0, poff;
2466         struct ipv6hdr *ip6;
2467         struct iphdr *ip;
2468         u8 ip_proto;
2469         u32 addr1, addr2, ihl;
2470         union {
2471                 u32 v32;
2472                 u16 v16[2];
2473         } ports;
2474
2475         nhoff = skb_network_offset(skb);
2476
2477         switch (skb->protocol) {
2478         case __constant_htons(ETH_P_IP):
2479                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2480                         goto done;
2481
2482                 ip = (struct iphdr *) (skb->data + nhoff);
2483                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2484                         ip_proto = 0;
2485                 else
2486                         ip_proto = ip->protocol;
2487                 addr1 = (__force u32) ip->saddr;
2488                 addr2 = (__force u32) ip->daddr;
2489                 ihl = ip->ihl;
2490                 break;
2491         case __constant_htons(ETH_P_IPV6):
2492                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2493                         goto done;
2494
2495                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2496                 ip_proto = ip6->nexthdr;
2497                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2498                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2499                 ihl = (40 >> 2);
2500                 break;
2501         default:
2502                 goto done;
2503         }
2504
2505         ports.v32 = 0;
2506         poff = proto_ports_offset(ip_proto);
2507         if (poff >= 0) {
2508                 nhoff += ihl * 4 + poff;
2509                 if (pskb_may_pull(skb, nhoff + 4)) {
2510                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2511                         if (ports.v16[1] < ports.v16[0])
2512                                 swap(ports.v16[0], ports.v16[1]);
2513                 }
2514         }
2515
2516         /* get a consistent hash (same value on both flow directions) */
2517         if (addr2 < addr1)
2518                 swap(addr1, addr2);
2519
2520         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2521         if (!hash)
2522                 hash = 1;
2523
2524 done:
2525         return hash;
2526 }
2527 EXPORT_SYMBOL(__skb_get_rxhash);
2528
2529 #ifdef CONFIG_RPS
2530
2531 /* One global table that all flow-based protocols share. */
2532 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2533 EXPORT_SYMBOL(rps_sock_flow_table);
2534
2535 /*
2536  * get_rps_cpu is called from netif_receive_skb and returns the target
2537  * CPU from the RPS map of the receiving queue for a given skb.
2538  * rcu_read_lock must be held on entry.
2539  */
2540 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2541                        struct rps_dev_flow **rflowp)
2542 {
2543         struct netdev_rx_queue *rxqueue;
2544         struct rps_map *map;
2545         struct rps_dev_flow_table *flow_table;
2546         struct rps_sock_flow_table *sock_flow_table;
2547         int cpu = -1;
2548         u16 tcpu;
2549
2550         if (skb_rx_queue_recorded(skb)) {
2551                 u16 index = skb_get_rx_queue(skb);
2552                 if (unlikely(index >= dev->real_num_rx_queues)) {
2553                         WARN_ONCE(dev->real_num_rx_queues > 1,
2554                                   "%s received packet on queue %u, but number "
2555                                   "of RX queues is %u\n",
2556                                   dev->name, index, dev->real_num_rx_queues);
2557                         goto done;
2558                 }
2559                 rxqueue = dev->_rx + index;
2560         } else
2561                 rxqueue = dev->_rx;
2562
2563         map = rcu_dereference(rxqueue->rps_map);
2564         if (map) {
2565                 if (map->len == 1) {
2566                         tcpu = map->cpus[0];
2567                         if (cpu_online(tcpu))
2568                                 cpu = tcpu;
2569                         goto done;
2570                 }
2571         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2572                 goto done;
2573         }
2574
2575         skb_reset_network_header(skb);
2576         if (!skb_get_rxhash(skb))
2577                 goto done;
2578
2579         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2580         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2581         if (flow_table && sock_flow_table) {
2582                 u16 next_cpu;
2583                 struct rps_dev_flow *rflow;
2584
2585                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2586                 tcpu = rflow->cpu;
2587
2588                 next_cpu = sock_flow_table->ents[skb->rxhash &
2589                     sock_flow_table->mask];
2590
2591                 /*
2592                  * If the desired CPU (where last recvmsg was done) is
2593                  * different from current CPU (one in the rx-queue flow
2594                  * table entry), switch if one of the following holds:
2595                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2596                  *   - Current CPU is offline.
2597                  *   - The current CPU's queue tail has advanced beyond the
2598                  *     last packet that was enqueued using this table entry.
2599                  *     This guarantees that all previous packets for the flow
2600                  *     have been dequeued, thus preserving in order delivery.
2601                  */
2602                 if (unlikely(tcpu != next_cpu) &&
2603                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2604                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2605                       rflow->last_qtail)) >= 0)) {
2606                         tcpu = rflow->cpu = next_cpu;
2607                         if (tcpu != RPS_NO_CPU)
2608                                 rflow->last_qtail = per_cpu(softnet_data,
2609                                     tcpu).input_queue_head;
2610                 }
2611                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2612                         *rflowp = rflow;
2613                         cpu = tcpu;
2614                         goto done;
2615                 }
2616         }
2617
2618         if (map) {
2619                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2620
2621                 if (cpu_online(tcpu)) {
2622                         cpu = tcpu;
2623                         goto done;
2624                 }
2625         }
2626
2627 done:
2628         return cpu;
2629 }
2630
2631 /* Called from hardirq (IPI) context */
2632 static void rps_trigger_softirq(void *data)
2633 {
2634         struct softnet_data *sd = data;
2635
2636         ____napi_schedule(sd, &sd->backlog);
2637         sd->received_rps++;
2638 }
2639
2640 #endif /* CONFIG_RPS */
2641
/*
 * Decide whether @sd belongs to another CPU.
 *
 * If it does, chain it on this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  If @sd is this CPU's own softnet_data (or RPS is not
 * configured), return 0 and leave everything untouched.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

        if (sd == local_sd)
                return 0;

        /* Push @sd on the head of the local pending-IPI list. */
        sd->rps_ipi_next = local_sd->rps_ipi_list;
        local_sd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2662
2663 /*
2664  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2665  * queue (may be a remote CPU queue).
2666  */
2667 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2668                               unsigned int *qtail)
2669 {
2670         struct softnet_data *sd;
2671         unsigned long flags;
2672
2673         sd = &per_cpu(softnet_data, cpu);
2674
2675         local_irq_save(flags);
2676
2677         rps_lock(sd);
2678         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2679                 if (skb_queue_len(&sd->input_pkt_queue)) {
2680 enqueue:
2681                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2682                         input_queue_tail_incr_save(sd, qtail);
2683                         rps_unlock(sd);
2684                         local_irq_restore(flags);
2685                         return NET_RX_SUCCESS;
2686                 }
2687
2688                 /* Schedule NAPI for backlog device
2689                  * We can use non atomic operation since we own the queue lock
2690                  */
2691                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2692                         if (!rps_ipi_queued(sd))
2693                                 ____napi_schedule(sd, &sd->backlog);
2694                 }
2695                 goto enqueue;
2696         }
2697
2698         sd->dropped++;
2699         rps_unlock(sd);
2700
2701         local_irq_restore(flags);
2702
2703         atomic_long_inc(&skb->dev->rx_dropped);
2704         kfree_skb(skb);
2705         return NET_RX_DROP;
2706 }
2707
2708 /**
2709  *      netif_rx        -       post buffer to the network code
2710  *      @skb: buffer to post
2711  *
2712  *      This function receives a packet from a device driver and queues it for
2713  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2714  *      may be dropped during processing for congestion control or by the
2715  *      protocol layers.
2716  *
2717  *      return values:
2718  *      NET_RX_SUCCESS  (no congestion)
2719  *      NET_RX_DROP     (packet was dropped)
2720  *
2721  */
2722
2723 int netif_rx(struct sk_buff *skb)
2724 {
2725         int ret;
2726
2727         /* if netpoll wants it, pretend we never saw it */
2728         if (netpoll_rx(skb))
2729                 return NET_RX_DROP;
2730
2731         if (netdev_tstamp_prequeue)
2732                 net_timestamp_check(skb);
2733
2734         trace_netif_rx(skb);
2735 #ifdef CONFIG_RPS
2736         {
2737                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2738                 int cpu;
2739
2740                 preempt_disable();
2741                 rcu_read_lock();
2742
2743                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2744                 if (cpu < 0)
2745                         cpu = smp_processor_id();
2746
2747                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2748
2749                 rcu_read_unlock();
2750                 preempt_enable();
2751         }
2752 #else
2753         {
2754                 unsigned int qtail;
2755                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2756                 put_cpu();
2757         }
2758 #endif
2759         return ret;
2760 }
2761 EXPORT_SYMBOL(netif_rx);
2762
/*
 * netif_rx_ni - netif_rx() followed by running any pending softirqs.
 * @skb: buffer to post
 *
 * Preemption is disabled around the call so that the softirq check
 * happens on the CPU that did the enqueue.  Returns whatever netif_rx()
 * returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2776
/*
 * net_tx_action - transmit-side deferred work for the current CPU.
 * @h: softirq action descriptor (not used in this function)
 *
 * Handles two lists hanging off this CPU's softnet_data:
 *   - ->completion_queue: skbs waiting to be freed;
 *   - ->output_queue:     Qdiscs waiting to be run.
 * Each list is detached with local interrupts disabled and then walked
 * with interrupts enabled again.
 */
static void net_tx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);

        if (sd->completion_queue) {
                struct sk_buff *clist;

                /* Take the whole list atomically w.r.t. local interrupts. */
                local_irq_disable();
                clist = sd->completion_queue;
                sd->completion_queue = NULL;
                local_irq_enable();

                while (clist) {
                        struct sk_buff *skb = clist;
                        clist = clist->next;

                        /* An skb queued for freeing must have no users left. */
                        WARN_ON(atomic_read(&skb->users));
                        trace_kfree_skb(skb, net_tx_action);
                        __kfree_skb(skb);
                }
        }

        if (sd->output_queue) {
                struct Qdisc *head;

                /* Detach the list and reset the tail pointer to "empty". */
                local_irq_disable();
                head = sd->output_queue;
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
                local_irq_enable();

                while (head) {
                        struct Qdisc *q = head;
                        spinlock_t *root_lock;

                        /* Advance first: q may be rescheduled below. */
                        head = head->next_sched;

                        root_lock = qdisc_lock(q);
                        if (spin_trylock(root_lock)) {
                                /* Got the lock: clear SCHED, then run. */
                                smp_mb__before_clear_bit();
                                clear_bit(__QDISC_STATE_SCHED,
                                          &q->state);
                                qdisc_run(q);
                                spin_unlock(root_lock);
                        } else {
                                /*
                                 * Lock is contended.  Requeue the qdisc for
                                 * a later pass unless it has been
                                 * deactivated, in which case only the SCHED
                                 * bit is cleared.
                                 */
                                if (!test_bit(__QDISC_STATE_DEACTIVATED,
                                              &q->state)) {
                                        __netif_reschedule(q);
                                } else {
                                        smp_mb__before_clear_bit();
                                        clear_bit(__QDISC_STATE_SCHED,
                                                  &q->state);
                                }
                        }
                }
        }
}
2834
2835 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2836     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address and returning
 * int; it is only built when both bridge and ATM LANE support are
 * configured (built-in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2841 #endif
2842
2843 #ifdef CONFIG_NET_CLS_ACT
2844 /* TODO: Maybe we should just force sch_ingress to be compiled in
2845  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2846  * a compare and 2 stores extra right now if we dont have it on
2847  * but have CONFIG_NET_CLS_ACT
2848  * NOTE: This doesnt stop any functionality; if you dont have
2849  * the ingress scheduler, you just cant add policies on ingress.
2850  *
2851  */
2852 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2853 {
2854         struct net_device *dev = skb->dev;
2855         u32 ttl = G_TC_RTTL(skb->tc_verd);
2856         int result = TC_ACT_OK;
2857         struct Qdisc *q;
2858
2859         if (unlikely(MAX_RED_LOOP < ttl++)) {
2860                 if (net_ratelimit())
2861                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2862                                skb->skb_iif, dev->ifindex);
2863                 return TC_ACT_SHOT;
2864         }
2865
2866         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2867         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2868
2869         q = rxq->qdisc;
2870         if (q != &noop_qdisc) {
2871                 spin_lock(qdisc_lock(q));
2872                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2873                         result = qdisc_enqueue_root(skb, q);
2874                 spin_unlock(qdisc_lock(q));
2875         }
2876
2877         return result;
2878 }
2879
2880 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2881                                          struct packet_type **pt_prev,
2882                                          int *ret, struct net_device *orig_dev)
2883 {
2884         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2885
2886         if (!rxq || rxq->qdisc == &noop_qdisc)
2887                 goto out;
2888
2889         if (*pt_prev) {
2890                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2891                 *pt_prev = NULL;
2892         }
2893
2894         switch (ing_filter(skb, rxq)) {
2895         case TC_ACT_SHOT:
2896         case TC_ACT_STOLEN:
2897                 kfree_skb(skb);
2898                 return NULL;
2899         }
2900
2901 out:
2902         skb->tc_verd = 0;
2903         return skb;
2904 }
2905 #endif
2906
2907 /**
2908  *      netdev_rx_handler_register - register receive handler
2909  *      @dev: device to register a handler for
2910  *      @rx_handler: receive handler to register
2911  *      @rx_handler_data: data pointer that is used by rx handler
2912  *
2913  *      Register a receive hander for a device. This handler will then be
2914  *      called from __netif_receive_skb. A negative errno code is returned
2915  *      on a failure.
2916  *
2917  *      The caller must hold the rtnl_mutex.
2918  */
2919 int netdev_rx_handler_register(struct net_device *dev,
2920                                rx_handler_func_t *rx_handler,
2921                                void *rx_handler_data)
2922 {
2923         ASSERT_RTNL();
2924
2925         if (dev->rx_handler)
2926                 return -EBUSY;
2927
2928         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2929         rcu_assign_pointer(dev->rx_handler, rx_handler);
2930
2931         return 0;
2932 }
2933 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2934
2935 /**
2936  *      netdev_rx_handler_unregister - unregister receive handler
2937  *      @dev: device to unregister a handler from
2938  *
2939  *      Unregister a receive hander from a device.
2940  *
2941  *      The caller must hold the rtnl_mutex.
2942  */
2943 void netdev_rx_handler_unregister(struct net_device *dev)
2944 {
2945
2946         ASSERT_RTNL();
2947         rcu_assign_pointer(dev->rx_handler, NULL);
2948         rcu_assign_pointer(dev->rx_handler_data, NULL);
2949 }
2950 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2951
2952 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2953                                               struct net_device *master)
2954 {
2955         if (skb->pkt_type == PACKET_HOST) {
2956                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2957
2958                 memcpy(dest, master->dev_addr, ETH_ALEN);
2959         }
2960 }
2961
2962 /* On bonding slaves other than the currently active slave, suppress
2963  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2964  * ARP on active-backup slaves with arp_validate enabled.
2965  */
2966 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2967 {
2968         struct net_device *dev = skb->dev;
2969
2970         if (master->priv_flags & IFF_MASTER_ARPMON)
2971                 dev->last_rx = jiffies;
2972
2973         if ((master->priv_flags & IFF_MASTER_ALB) &&
2974             (master->priv_flags & IFF_BRIDGE_PORT)) {
2975                 /* Do address unmangle. The local destination address
2976                  * will be always the one master has. Provides the right
2977                  * functionality in a bridge.
2978                  */
2979                 skb_bond_set_mac_by_master(skb, master);
2980         }
2981
2982         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2983                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2984                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
2985                         return 0;
2986
2987                 if (master->priv_flags & IFF_MASTER_ALB) {
2988                         if (skb->pkt_type != PACKET_BROADCAST &&
2989                             skb->pkt_type != PACKET_MULTICAST)
2990                                 return 0;
2991                 }
2992                 if (master->priv_flags & IFF_MASTER_8023AD &&
2993                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2994                         return 0;
2995
2996                 return 1;
2997         }
2998         return 0;
2999 }
3000 EXPORT_SYMBOL(__skb_bond_should_drop);
3001
/*
 * __netif_receive_skb - hand one received skb to the registered handlers.
 * @skb: buffer to process
 *
 * Stages, all inside a single rcu_read_lock() section:
 *   1. every matching entry on ptype_all;
 *   2. the ingress hook handle_ing() (CONFIG_NET_CLS_ACT only);
 *   3. the device's rx_handler, if one is registered;
 *   4. VLAN tag handling through vlan_hwaccel_do_receive();
 *   5. every matching entry in the ptype_base[] bucket of skb->protocol.
 *
 * Delivery runs "one behind": a matching handler is remembered in pt_prev
 * and only given the skb through deliver_skb() when the next match (or the
 * next stage) comes along.  The last match is called directly through
 * ->func().
 *
 * ret starts as NET_RX_DROP and is overwritten by each handler result;
 * that value is returned.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *master;
        struct net_device *null_or_orig;
        struct net_device *orig_or_bond;
        int ret = NET_RX_DROP;
        __be16 type;

        /* Timestamp here unless it was already done before queueing. */
        if (!netdev_tstamp_prequeue)
                net_timestamp_check(skb);

        trace_netif_receive_skb(skb);

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        /* Record the incoming interface once; later passes keep it. */
        if (!skb->skb_iif)
                skb->skb_iif = skb->dev->ifindex;

        /*
         * bonding note: skbs received on inactive slaves should only
         * be delivered to pkt handlers that are exact matches.  Also
         * the deliver_no_wcard flag will be set.  If packet handlers
         * are sensitive to duplicate packets these skbs will need to
         * be dropped at the handler.
         */
        null_or_orig = NULL;
        orig_dev = skb->dev;
        master = ACCESS_ONCE(orig_dev->master);
        if (skb->deliver_no_wcard)
                null_or_orig = orig_dev;
        else if (master) {
                if (skb_bond_should_drop(skb, master)) {
                        skb->deliver_no_wcard = 1;
                        null_or_orig = orig_dev; /* deliver only exact match */
                } else
                        skb->dev = master;
        }

        __this_cpu_inc(softnet_data.processed);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the ptype_all and ingress stages. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /*
         * Stage 1: handlers on ptype_all.  With null_or_orig == NULL this
         * also matches handlers that are not bound to a device.
         */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
                    ptype->dev == orig_dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* Stage 2: ingress hook; a NULL return means the skb is gone. */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        /* Handle special case of bridge or macvlan */
        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                /* Flush the pending handler before rx_handler takes over. */
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                skb = rx_handler(skb);
                if (!skb)
                        goto out;
        }

        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                /*
                 * A true return means the skb must go through the receive
                 * path again, hence the recursive call.  Otherwise the
                 * helper may have set skb to NULL, in which case we stop.
                 */
                if (vlan_hwaccel_do_receive(&skb)) {
                        ret = __netif_receive_skb(skb);
                        goto out;
                } else if (unlikely(!skb))
                        goto out;
        }

        /*
         * Make sure frames received on VLAN interfaces stacked on
         * bonding interfaces still make their way to any base bonding
         * device that may have registered for a specific ptype.  The
         * handler may have to adjust skb->dev and orig_dev.
         */
        orig_or_bond = orig_dev;
        if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
            (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
                orig_or_bond = vlan_dev_real_dev(skb->dev);
        }

        /* Stage 5: protocol handlers hashed by skb->protocol. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && (ptype->dev == null_or_orig ||
                     ptype->dev == skb->dev || ptype->dev == orig_dev ||
                     ptype->dev == orig_or_bond)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last match gets the skb directly, without deliver_skb(). */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                /* Nobody wanted it: count the drop and free the skb. */
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
3140
3141 /**
3142  *      netif_receive_skb - process receive buffer from network
3143  *      @skb: buffer to process
3144  *
3145  *      netif_receive_skb() is the main receive data processing function.
3146  *      It always succeeds. The buffer may be dropped during processing
3147  *      for congestion control or by the protocol layers.
3148  *
3149  *      This function may only be called from softirq context and interrupts
3150  *      should be enabled.
3151  *
3152  *      Return values (usually ignored):
3153  *      NET_RX_SUCCESS: no congestion
3154  *      NET_RX_DROP: packet was dropped
3155  */
3156 int netif_receive_skb(struct sk_buff *skb)
3157 {
3158         if (netdev_tstamp_prequeue)
3159                 net_timestamp_check(skb);
3160
3161         if (skb_defer_rx_timestamp(skb))
3162                 return NET_RX_SUCCESS;
3163
3164 #ifdef CONFIG_RPS
3165         {
3166                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3167                 int cpu, ret;
3168
3169                 rcu_read_lock();
3170
3171                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3172
3173                 if (cpu >= 0) {
3174                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3175                         rcu_read_unlock();
3176                 } else {
3177                         rcu_read_unlock();
3178                         ret = __netif_receive_skb(skb);
3179                 }
3180
3181                 return ret;
3182         }
3183 #else
3184         return __netif_receive_skb(skb);
3185 #endif
3186 }
3187 EXPORT_SYMBOL(netif_receive_skb);
3188
3189 /* Network device is going away, flush any packets still pending
3190  * Called with irqs disabled.
3191  */
3192 static void flush_backlog(void *arg)
3193 {
3194         struct net_device *dev = arg;
3195         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3196         struct sk_buff *skb, *tmp;
3197
3198         rps_lock(sd);
3199         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3200                 if (skb->dev == dev) {
3201                         __skb_unlink(skb, &sd->input_pkt_queue);
3202                         kfree_skb(skb);
3203                         input_queue_head_incr(sd);
3204                 }
3205         }
3206         rps_unlock(sd);
3207
3208         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3209                 if (skb->dev == dev) {
3210                         __skb_unlink(skb, &sd->process_queue);
3211                         kfree_skb(skb);
3212                         input_queue_head_incr(sd);
3213                 }
3214         }
3215 }
3216
3217 static int napi_gro_complete(struct sk_buff *skb)
3218 {
3219         struct packet_type *ptype;
3220         __be16 type = skb->protocol;
3221         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3222         int err = -ENOENT;
3223
3224         if (NAPI_GRO_CB(skb)->count == 1) {
3225                 skb_shinfo(skb)->gso_size = 0;
3226                 goto out;
3227         }
3228
3229         rcu_read_lock();
3230         list_for_each_entry_rcu(ptype, head, list) {
3231                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3232                         continue;
3233
3234                 err = ptype->gro_complete(skb);
3235                 break;
3236         }
3237         rcu_read_unlock();
3238
3239         if (err) {
3240                 WARN_ON(&ptype->list == head);
3241                 kfree_skb(skb);
3242                 return NET_RX_SUCCESS;
3243         }
3244
3245 out:
3246         return netif_receive_skb(skb);
3247 }
3248
3249 inline void napi_gro_flush(struct napi_struct *napi)
3250 {
3251         struct sk_buff *skb, *next;
3252
3253         for (skb = napi->gro_list; skb; skb = next) {
3254                 next = skb->next;
3255                 skb->next = NULL;
3256                 napi_gro_complete(skb);
3257         }
3258
3259         napi->gro_count = 0;
3260         napi->gro_list = NULL;
3261 }
3262 EXPORT_SYMBOL(napi_gro_flush);
3263
/*
 * dev_gro_receive - try to merge @skb into a packet held on @napi's GRO list.
 * @napi: NAPI context owning the gro_list
 * @skb:  incoming packet
 *
 * Returns:
 *   GRO_NORMAL      - GRO does not apply (feature off, netpoll active, GSO
 *                     or frag-list skb, no gro_receive handler, flush
 *                     requested, or MAX_GRO_SKBS packets already held);
 *   GRO_MERGED      - @skb was merged into a held packet;
 *   GRO_MERGED_FREE - as above, and the callback set the ->free flag;
 *   GRO_HELD        - @skb was put on the gro_list as a new flow.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        /* Find the first device-less handler for this protocol with GRO. */
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk ran off the end of the list: no handler was found. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /*
         * A non-NULL pp points at the list slot of a held skb that has to
         * be completed now: unlink it and pass it up.
         */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        /* Merged into an existing flow: done, no header pull needed. */
        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Start a new flow: hold the skb at the head of the gro_list. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * If the linear area is shorter than the GRO offset, copy the
         * missing bytes from frag0 into the linear area and trim them off
         * the first page fragment.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* First fragment fully consumed: release it and shift the rest. */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        /* Not handled by GRO; still go through the pull step above. */
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3355
3356 static inline gro_result_t
3357 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3358 {
3359         struct sk_buff *p;
3360
3361         for (p = napi->gro_list; p; p = p->next) {
3362                 unsigned long diffs;
3363
3364                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3365                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3366                 diffs |= compare_ether_header(skb_mac_header(p),
3367                                               skb_gro_mac_header(skb));
3368                 NAPI_GRO_CB(p)->same_flow = !diffs;
3369                 NAPI_GRO_CB(p)->flush = 0;
3370         }
3371
3372         return dev_gro_receive(napi, skb);
3373 }
3374
3375 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3376 {
3377         switch (ret) {
3378         case GRO_NORMAL:
3379                 if (netif_receive_skb(skb))
3380                         ret = GRO_DROP;
3381                 break;
3382
3383         case GRO_DROP:
3384         case GRO_MERGED_FREE:
3385                 kfree_skb(skb);
3386                 break;
3387
3388         case GRO_HELD:
3389         case GRO_MERGED:
3390                 break;
3391         }
3392
3393         return ret;
3394 }
3395 EXPORT_SYMBOL(napi_skb_finish);
3396
3397 void skb_gro_reset_offset(struct sk_buff *skb)
3398 {
3399         NAPI_GRO_CB(skb)->data_offset = 0;
3400         NAPI_GRO_CB(skb)->frag0 = NULL;
3401         NAPI_GRO_CB(skb)->frag0_len = 0;
3402
3403         if (skb->mac_header == skb->tail &&
3404             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3405                 NAPI_GRO_CB(skb)->frag0 =
3406                         page_address(skb_shinfo(skb)->frags[0].page) +
3407                         skb_shinfo(skb)->frags[0].page_offset;
3408                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3409         }
3410 }
3411 EXPORT_SYMBOL(skb_gro_reset_offset);
3412
3413 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3414 {
3415         skb_gro_reset_offset(skb);
3416
3417         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3418 }
3419 EXPORT_SYMBOL(napi_gro_receive);
3420
3421 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3422 {
3423         __skb_pull(skb, skb_headlen(skb));
3424         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3425         skb->vlan_tci = 0;
3426
3427         napi->skb = skb;
3428 }
3429
3430 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3431 {
3432         struct sk_buff *skb = napi->skb;
3433
3434         if (!skb) {
3435                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3436                 if (skb)
3437                         napi->skb = skb;
3438         }
3439         return skb;
3440 }
3441 EXPORT_SYMBOL(napi_get_frags);
3442
3443 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3444                                gro_result_t ret)
3445 {
3446         switch (ret) {
3447         case GRO_NORMAL:
3448         case GRO_HELD:
3449                 skb->protocol = eth_type_trans(skb, skb->dev);
3450
3451                 if (ret == GRO_HELD)
3452                         skb_gro_pull(skb, -ETH_HLEN);
3453                 else if (netif_receive_skb(skb))
3454                         ret = GRO_DROP;
3455                 break;
3456
3457         case GRO_DROP:
3458         case GRO_MERGED_FREE:
3459                 napi_reuse_skb(napi, skb);
3460                 break;
3461
3462         case GRO_MERGED:
3463                 break;
3464         }
3465
3466         return ret;
3467 }
3468 EXPORT_SYMBOL(napi_frags_finish);
3469
3470 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3471 {
3472         struct sk_buff *skb = napi->skb;
3473         struct ethhdr *eth;
3474         unsigned int hlen;
3475         unsigned int off;
3476
3477         napi->skb = NULL;
3478
3479         skb_reset_mac_header(skb);
3480         skb_gro_reset_offset(skb);
3481
3482         off = skb_gro_offset(skb);
3483         hlen = off + sizeof(*eth);
3484         eth = skb_gro_header_fast(skb, off);
3485         if (skb_gro_header_hard(skb, hlen)) {
3486                 eth = skb_gro_header_slow(skb, hlen, off);
3487                 if (unlikely(!eth)) {
3488                         napi_reuse_skb(napi, skb);
3489                         skb = NULL;
3490                         goto out;
3491                 }
3492         }
3493
3494         skb_gro_pull(skb, sizeof(*eth));
3495
3496         /*
3497          * This works because the only protocols we care about don't require
3498          * special handling.  We'll fix it up properly at the end.
3499          */
3500         skb->protocol = eth->h_proto;
3501
3502 out:
3503         return skb;
3504 }
3505 EXPORT_SYMBOL(napi_frags_skb);
3506
3507 gro_result_t napi_gro_frags(struct napi_struct *napi)
3508 {
3509         struct sk_buff *skb = napi_frags_skb(napi);
3510
3511         if (!skb)
3512                 return GRO_DROP;
3513
3514         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3515 }
3516 EXPORT_SYMBOL(napi_gro_frags);
3517
/*
 * Send any IPIs queued on sd->rps_ipi_list for RPS, then leave with local
 * interrupts enabled.
 *
 * Note: must be called with local irqs disabled; always returns with them
 * enabled.  Without CONFIG_RPS this only re-enables interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (!remsd) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list while irqs are still off. */
        sd->rps_ipi_list = NULL;
        local_irq_enable();

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (remsd) {
                struct softnet_data *cur = remsd;

                /* Fetch the successor before the IPI is issued. */
                remsd = cur->rps_ipi_next;

                if (cpu_online(cur->cpu))
                        __smp_call_function_single(cur->cpu, &cur->csd, 0);
        }
#else
        local_irq_enable();
#endif
}
3545
3546 static int process_backlog(struct napi_struct *napi, int quota)
3547 {
3548         int work = 0;
3549         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3550
3551 #ifdef CONFIG_RPS
3552         /* Check if we have pending ipi, its better to send them now,
3553          * not waiting net_rx_action() end.
3554          */
3555         if (sd->rps_ipi_list) {
3556                 local_irq_disable();
3557                 net_rps_action_and_irq_enable(sd);
3558         }
3559 #endif
3560         napi->weight = weight_p;
3561         local_irq_disable();
3562         while (work < quota) {
3563                 struct sk_buff *skb;
3564                 unsigned int qlen;
3565
3566                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3567                         local_irq_enable();
3568                         __netif_receive_skb(skb);
3569                         local_irq_disable();
3570                         input_queue_head_incr(sd);
3571                         if (++work >= quota) {
3572                                 local_irq_enable();
3573                                 return work;
3574                         }
3575                 }
3576
3577                 rps_lock(sd);
3578                 qlen = skb_queue_len(&sd->input_pkt_queue);
3579                 if (qlen)
3580                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3581                                                    &sd->process_queue);
3582
3583                 if (qlen < quota - work) {
3584                         /*
3585                          * Inline a custom version of __napi_complete().
3586                          * only current cpu owns and manipulates this napi,
3587                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3588                          * we can use a plain write instead of clear_bit(),
3589                          * and we dont need an smp_mb() memory barrier.
3590                          */
3591                         list_del(&napi->poll_list);
3592                         napi->state = 0;
3593
3594                         quota = work + qlen;
3595                 }
3596                 rps_unlock(sd);
3597         }
3598         local_irq_enable();
3599
3600         return work;
3601 }
3602
3603 /**
3604  * __napi_schedule - schedule for receive
3605  * @n: entry to schedule
3606  *
3607  * The entry's receive function will be scheduled to run
3608  */
3609 void __napi_schedule(struct napi_struct *n)
3610 {
3611         unsigned long flags;
3612
3613         local_irq_save(flags);
3614         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3615         local_irq_restore(flags);
3616 }
3617 EXPORT_SYMBOL(__napi_schedule);
3618
3619 void __napi_complete(struct napi_struct *n)
3620 {
3621         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3622         BUG_ON(n->gro_list);
3623
3624         list_del(&n->poll_list);
3625         smp_mb__before_clear_bit();
3626         clear_bit(NAPI_STATE_SCHED, &n->state);
3627 }
3628 EXPORT_SYMBOL(__napi_complete);
3629
3630 void napi_complete(struct napi_struct *n)
3631 {
3632         unsigned long flags;
3633
3634         /*
3635          * don't let napi dequeue from the cpu poll list
3636          * just in case its running on a different cpu
3637          */
3638         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3639                 return;
3640
3641         napi_gro_flush(n);
3642         local_irq_save(flags);
3643         __napi_complete(n);
3644         local_irq_restore(flags);
3645 }
3646 EXPORT_SYMBOL(napi_complete);
3647
3648 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3649                     int (*poll)(struct napi_struct *, int), int weight)
3650 {
3651         INIT_LIST_HEAD(&napi->poll_list);
3652         napi->gro_count = 0;
3653         napi->gro_list = NULL;
3654         napi->skb = NULL;
3655         napi->poll = poll;
3656         napi->weight = weight;
3657         list_add(&napi->dev_list, &dev->napi_list);
3658         napi->dev = dev;
3659 #ifdef CONFIG_NETPOLL
3660         spin_lock_init(&napi->poll_lock);
3661         napi->poll_owner = -1;
3662 #endif
3663         set_bit(NAPI_STATE_SCHED, &napi->state);
3664 }
3665 EXPORT_SYMBOL(netif_napi_add);
3666
3667 void netif_napi_del(struct napi_struct *napi)
3668 {
3669         struct sk_buff *skb, *next;
3670
3671         list_del_init(&napi->dev_list);
3672         napi_free_frags(napi);
3673
3674         for (skb = napi->gro_list; skb; skb = next) {
3675                 next = skb->next;
3676                 skb->next = NULL;
3677                 kfree_skb(skb);
3678         }
3679
3680         napi->gro_list = NULL;
3681         napi->gro_count = 0;
3682 }
3683 EXPORT_SYMBOL(netif_napi_del);
3684
3685 static void net_rx_action(struct softirq_action *h)
3686 {
3687         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3688         unsigned long time_limit = jiffies + 2;
3689         int budget = netdev_budget;
3690         void *have;
3691
3692         local_irq_disable();
3693
3694         while (!list_empty(&sd->poll_list)) {
3695                 struct napi_struct *n;
3696                 int work, weight;
3697
3698                 /* If softirq window is exhuasted then punt.
3699                  * Allow this to run for 2 jiffies since which will allow
3700                  * an average latency of 1.5/HZ.
3701                  */
3702                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3703                         goto softnet_break;
3704
3705                 local_irq_enable();
3706
3707                 /* Even though interrupts have been re-enabled, this
3708                  * access is safe because interrupts can only add new
3709                  * entries to the tail of this list, and only ->poll()
3710                  * calls can remove this head entry from the list.
3711                  */
3712                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3713
3714                 have = netpoll_poll_lock(n);
3715
3716                 weight = n->weight;
3717
3718                 /* This NAPI_STATE_SCHED test is for avoiding a race
3719                  * with netpoll's poll_napi().  Only the entity which
3720                  * obtains the lock and sees NAPI_STATE_SCHED set will
3721                  * actually make the ->poll() call.  Therefore we avoid
3722                  * accidently calling ->poll() when NAPI is not scheduled.
3723                  */
3724                 work = 0;
3725                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3726                         work = n->poll(n, weight);
3727                         trace_napi_poll(n);
3728                 }
3729
3730                 WARN_ON_ONCE(work > weight);
3731
3732                 budget -= work;
3733
3734                 local_irq_disable();
3735
3736                 /* Drivers must not modify the NAPI state if they
3737                  * consume the entire weight.  In such cases this code
3738                  * still "owns" the NAPI instance and therefore can
3739                  * move the instance around on the list at-will.
3740                  */
3741                 if (unlikely(work == weight)) {
3742                         if (unlikely(napi_disable_pending(n))) {
3743                                 local_irq_enable();
3744                                 napi_complete(n);
3745                                 local_irq_disable();
3746                         } else
3747                                 list_move_tail(&n->poll_list, &sd->poll_list);
3748                 }
3749
3750                 netpoll_poll_unlock(have);
3751         }
3752 out:
3753         net_rps_action_and_irq_enable(sd);
3754
3755 #ifdef CONFIG_NET_DMA
3756         /*
3757          * There may not be any more sk_buffs coming right now, so push
3758          * any pending DMA copies to hardware
3759          */
3760         dma_issue_pending_all();
3761 #endif
3762
3763         return;
3764
3765 softnet_break:
3766         sd->time_squeeze++;
3767         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3768         goto out;
3769 }
3770
3771 static gifconf_func_t *gifconf_list[NPROTO];
3772
3773 /**
3774  *      register_gifconf        -       register a SIOCGIF handler
3775  *      @family: Address family
3776  *      @gifconf: Function handler
3777  *
3778  *      Register protocol dependent address dumping routines. The handler
3779  *      that is passed must not be freed or reused until it has been replaced
3780  *      by another handler.
3781  */
3782 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3783 {
3784         if (family >= NPROTO)
3785                 return -EINVAL;
3786         gifconf_list[family] = gifconf;
3787         return 0;
3788 }
3789 EXPORT_SYMBOL(register_gifconf);
3790
3791
3792 /*
3793  *      Map an interface index to its name (SIOCGIFNAME)
3794  */
3795
3796 /*
3797  *      We need this ioctl for efficient implementation of the
3798  *      if_indextoname() function required by the IPv6 API.  Without
3799  *      it, we would have to search all the interfaces to find a
3800  *      match.  --pb
3801  */
3802
3803 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3804 {
3805         struct net_device *dev;
3806         struct ifreq ifr;
3807
3808         /*
3809          *      Fetch the caller's info block.
3810          */
3811
3812         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3813                 return -EFAULT;
3814
3815         rcu_read_lock();
3816         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3817         if (!dev) {
3818                 rcu_read_unlock();
3819                 return -ENODEV;
3820         }
3821
3822         strcpy(ifr.ifr_name, dev->name);
3823         rcu_read_unlock();
3824
3825         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3826                 return -EFAULT;
3827         return 0;
3828 }
3829
3830 /*
3831  *      Perform a SIOCGIFCONF call. This structure will change
3832  *      size eventually, and there is nothing I can do about it.
3833  *      Thus we will need a 'compatibility mode'.
3834  */
3835
3836 static int dev_ifconf(struct net *net, char __user *arg)
3837 {
3838         struct ifconf ifc;
3839         struct net_device *dev;
3840         char __user *pos;
3841         int len;
3842         int total;
3843         int i;
3844
3845         /*
3846          *      Fetch the caller's info block.
3847          */
3848
3849         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3850                 return -EFAULT;
3851
3852         pos = ifc.ifc_buf;
3853         len = ifc.ifc_len;
3854
3855         /*
3856          *      Loop over the interfaces, and write an info block for each.
3857          */
3858
3859         total = 0;
3860         for_each_netdev(net, dev) {
3861                 for (i = 0; i < NPROTO; i++) {
3862                         if (gifconf_list[i]) {
3863                                 int done;
3864                                 if (!pos)
3865                                         done = gifconf_list[i](dev, NULL, 0);
3866                                 else
3867                                         done = gifconf_list[i](dev, pos + total,
3868                                                                len - total);
3869                                 if (done < 0)
3870                                         return -EFAULT;
3871                                 total += done;
3872                         }
3873                 }
3874         }
3875
3876         /*
3877          *      All done.  Write the updated control block back to the caller.
3878          */
3879         ifc.ifc_len = total;
3880
3881         /*
3882          *      Both BSD and Solaris return 0 here, so we do too.
3883          */
3884         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3885 }
3886
3887 #ifdef CONFIG_PROC_FS
3888 /*
3889  *      This is invoked by the /proc filesystem handler to display a device
3890  *      in detail.
3891  */
3892 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3893         __acquires(RCU)
3894 {
3895         struct net *net = seq_file_net(seq);
3896         loff_t off;
3897         struct net_device *dev;
3898
3899         rcu_read_lock();
3900         if (!*pos)
3901                 return SEQ_START_TOKEN;
3902
3903         off = 1;
3904         for_each_netdev_rcu(net, dev)
3905                 if (off++ == *pos)
3906                         return dev;
3907
3908         return NULL;
3909 }
3910
3911 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3912 {
3913         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3914                                   first_net_device(seq_file_net(seq)) :
3915                                   next_net_device((struct net_device *)v);
3916
3917         ++*pos;
3918         return rcu_dereference(dev);
3919 }
3920
3921 void dev_seq_stop(struct seq_file *seq, void *v)
3922         __releases(RCU)
3923 {
3924         rcu_read_unlock();
3925 }
3926
3927 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3928 {
3929         struct rtnl_link_stats64 temp;
3930         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3931
3932         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3933                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3934                    dev->name, stats->rx_bytes, stats->rx_packets,
3935                    stats->rx_errors,
3936                    stats->rx_dropped + stats->rx_missed_errors,
3937                    stats->rx_fifo_errors,
3938                    stats->rx_length_errors + stats->rx_over_errors +
3939                     stats->rx_crc_errors + stats->rx_frame_errors,
3940                    stats->rx_compressed, stats->multicast,
3941                    stats->tx_bytes, stats->tx_packets,
3942                    stats->tx_errors, stats->tx_dropped,
3943                    stats->tx_fifo_errors, stats->collisions,
3944                    stats->tx_carrier_errors +
3945                     stats->tx_aborted_errors +
3946                     stats->tx_window_errors +
3947                     stats->tx_heartbeat_errors,
3948                    stats->tx_compressed);
3949 }
3950
3951 /*
3952  *      Called from the PROCfs module. This now uses the new arbitrary sized
3953  *      /proc/net interface to create /proc/net/dev
3954  */
3955 static int dev_seq_show(struct seq_file *seq, void *v)
3956 {
3957         if (v == SEQ_START_TOKEN)
3958                 seq_puts(seq, "Inter-|   Receive                            "
3959                               "                    |  Transmit\n"
3960                               " face |bytes    packets errs drop fifo frame "
3961                               "compressed multicast|bytes    packets errs "
3962                               "drop fifo colls carrier compressed\n");
3963         else
3964                 dev_seq_printf_stats(seq, v);
3965         return 0;
3966 }
3967
3968 static struct softnet_data *softnet_get_online(loff_t *pos)
3969 {
3970         struct softnet_data *sd = NULL;
3971
3972         while (*pos < nr_cpu_ids)
3973                 if (cpu_online(*pos)) {
3974                         sd = &per_cpu(softnet_data, *pos);
3975                         break;
3976                 } else
3977                         ++*pos;
3978         return sd;
3979 }
3980
3981 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3982 {
3983         return softnet_get_online(pos);
3984 }
3985
3986 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3987 {
3988         ++*pos;
3989         return softnet_get_online(pos);
3990 }
3991
3992 static void softnet_seq_stop(struct seq_file *seq, void *v)
3993 {
3994 }
3995
3996 static int softnet_seq_show(struct seq_file *seq, void *v)
3997 {
3998         struct softnet_data *sd = v;
3999
4000         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4001                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4002                    0, 0, 0, 0, /* was fastroute */
4003                    sd->cpu_collision, sd->received_rps);
4004         return 0;
4005 }
4006
4007 static const struct seq_operations dev_seq_ops = {
4008         .start = dev_seq_start,
4009         .next  = dev_seq_next,
4010         .stop  = dev_seq_stop,
4011         .show  = dev_seq_show,
4012 };
4013
4014 static int dev_seq_open(struct inode *inode, struct file *file)
4015 {
4016         return seq_open_net(inode, file, &dev_seq_ops,
4017                             sizeof(struct seq_net_private));
4018 }
4019
4020 static const struct file_operations dev_seq_fops = {
4021         .owner   = THIS_MODULE,
4022         .open    = dev_seq_open,
4023         .read    = seq_read,
4024         .llseek  = seq_lseek,
4025         .release = seq_release_net,
4026 };
4027
4028 static const struct seq_operations softnet_seq_ops = {
4029         .start = softnet_seq_start,
4030         .next  = softnet_seq_next,
4031         .stop  = softnet_seq_stop,
4032         .show  = softnet_seq_show,
4033 };
4034
4035 static int softnet_seq_open(struct inode *inode, struct file *file)
4036 {
4037         return seq_open(file, &softnet_seq_ops);
4038 }
4039
4040 static const struct file_operations softnet_seq_fops = {
4041         .owner   = THIS_MODULE,
4042         .open    = softnet_seq_open,
4043         .read    = seq_read,
4044         .llseek  = seq_lseek,
4045         .release = seq_release,
4046 };
4047
4048 static void *ptype_get_idx(loff_t pos)
4049 {
4050         struct packet_type *pt = NULL;
4051         loff_t i = 0;
4052         int t;
4053
4054         list_for_each_entry_rcu(pt, &ptype_all, list) {
4055                 if (i == pos)
4056                         return pt;
4057                 ++i;
4058         }
4059
4060         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4061                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4062                         if (i == pos)
4063                                 return pt;
4064                         ++i;
4065                 }
4066         }
4067         return NULL;
4068 }
4069
4070 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4071         __acquires(RCU)
4072 {
4073         rcu_read_lock();
4074         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4075 }
4076
4077 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4078 {
4079         struct packet_type *pt;
4080         struct list_head *nxt;
4081         int hash;
4082
4083         ++*pos;
4084         if (v == SEQ_START_TOKEN)
4085                 return ptype_get_idx(0);
4086
4087         pt = v;
4088         nxt = pt->list.next;
4089         if (pt->type == htons(ETH_P_ALL)) {
4090                 if (nxt != &ptype_all)
4091                         goto found;
4092                 hash = 0;
4093                 nxt = ptype_base[0].next;
4094         } else
4095                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4096
4097         while (nxt == &ptype_base[hash]) {
4098                 if (++hash >= PTYPE_HASH_SIZE)
4099                         return NULL;
4100                 nxt = ptype_base[hash].next;
4101         }
4102 found:
4103         return list_entry(nxt, struct packet_type, list);
4104 }
4105
4106 static void ptype_seq_stop(struct seq_file *seq, void *v)
4107         __releases(RCU)
4108 {
4109         rcu_read_unlock();
4110 }
4111
4112 static int ptype_seq_show(struct seq_file *seq, void *v)
4113 {
4114         struct packet_type *pt = v;
4115
4116         if (v == SEQ_START_TOKEN)
4117                 seq_puts(seq, "Type Device      Function\n");
4118         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4119                 if (pt->type == htons(ETH_P_ALL))
4120                         seq_puts(seq, "ALL ");
4121                 else
4122                         seq_printf(seq, "%04x", ntohs(pt->type));
4123
4124                 seq_printf(seq, " %-8s %pF\n",
4125                            pt->dev ? pt->dev->name : "", pt->func);
4126         }
4127
4128         return 0;
4129 }
4130
4131 static const struct seq_operations ptype_seq_ops = {
4132         .start = ptype_seq_start,
4133         .next  = ptype_seq_next,
4134         .stop  = ptype_seq_stop,
4135         .show  = ptype_seq_show,
4136 };
4137
4138 static int ptype_seq_open(struct inode *inode, struct file *file)
4139 {
4140         return seq_open_net(inode, file, &ptype_seq_ops,
4141                         sizeof(struct seq_net_private));
4142 }
4143
4144 static const struct file_operations ptype_seq_fops = {
4145         .owner   = THIS_MODULE,
4146         .open    = ptype_seq_open,
4147         .read    = seq_read,
4148         .llseek  = seq_lseek,
4149         .release = seq_release_net,
4150 };
4151
4152
4153 static int __net_init dev_proc_net_init(struct net *net)
4154 {
4155         int rc = -ENOMEM;
4156
4157         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4158                 goto out;
4159         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4160                 goto out_dev;
4161         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4162                 goto out_softnet;
4163
4164         if (wext_proc_init(net))
4165                 goto out_ptype;
4166         rc = 0;
4167 out:
4168         return rc;
4169 out_ptype:
4170         proc_net_remove(net, "ptype");
4171 out_softnet:
4172         proc_net_remove(net, "softnet_stat");
4173 out_dev:
4174         proc_net_remove(net, "dev");
4175         goto out;
4176 }
4177
4178 static void __net_exit dev_proc_net_exit(struct net *net)
4179 {
4180         wext_proc_exit(net);
4181
4182         proc_net_remove(net, "ptype");
4183         proc_net_remove(net, "softnet_stat");
4184         proc_net_remove(net, "dev");
4185 }
4186
4187 static struct pernet_operations __net_initdata dev_proc_ops = {
4188         .init = dev_proc_net_init,
4189         .exit = dev_proc_net_exit,
4190 };
4191
4192 static int __init dev_proc_init(void)
4193 {
4194         return register_pernet_subsys(&dev_proc_ops);
4195 }
4196 #else
4197 #define dev_proc_init() 0
4198 #endif  /* CONFIG_PROC_FS */
4199
4200
4201 /**
4202  *      netdev_set_master       -       set up master/slave pair
4203  *      @slave: slave device
4204  *      @master: new master device
4205  *
4206  *      Changes the master device of the slave. Pass %NULL to break the
4207  *      bonding. The caller must hold the RTNL semaphore. On a failure
4208  *      a negative errno code is returned. On success the reference counts
4209  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4210  *      function returns zero.
4211  */
4212 int netdev_set_master(struct net_device *slave, struct net_device *master)
4213 {
4214         struct net_device *old = slave->master;
4215
4216         ASSERT_RTNL();
4217
4218         if (master) {
4219                 if (old)
4220                         return -EBUSY;
4221                 dev_hold(master);
4222         }
4223
4224         slave->master = master;
4225
4226         if (old) {
4227                 synchronize_net();
4228                 dev_put(old);
4229         }
4230         if (master)
4231                 slave->flags |= IFF_SLAVE;
4232         else
4233                 slave->flags &= ~IFF_SLAVE;
4234
4235         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4236         return 0;
4237 }
4238 EXPORT_SYMBOL(netdev_set_master);
4239
4240 static void dev_change_rx_flags(struct net_device *dev, int flags)
4241 {
4242         const struct net_device_ops *ops = dev->netdev_ops;
4243
4244         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4245                 ops->ndo_change_rx_flags(dev, flags);
4246 }
4247
4248 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4249 {
4250         unsigned short old_flags = dev->flags;
4251         uid_t uid;
4252         gid_t gid;
4253
4254         ASSERT_RTNL();
4255
4256         dev->flags |= IFF_PROMISC;
4257         dev->promiscuity += inc;
4258         if (dev->promiscuity == 0) {
4259                 /*
4260                  * Avoid overflow.
4261                  * If inc causes overflow, untouch promisc and return error.
4262                  */
4263                 if (inc < 0)
4264                         dev->flags &= ~IFF_PROMISC;
4265                 else {
4266                         dev->promiscuity -= inc;
4267                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4268                                 "set promiscuity failed, promiscuity feature "
4269                                 "of device might be broken.\n", dev->name);
4270                         return -EOVERFLOW;
4271                 }
4272         }
4273         if (dev->flags != old_flags) {
4274                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4275                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4276                                                                "left");
4277                 if (audit_enabled) {
4278                         current_uid_gid(&uid, &gid);
4279                         audit_log(current->audit_context, GFP_ATOMIC,
4280                                 AUDIT_ANOM_PROMISCUOUS,
4281                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4282                                 dev->name, (dev->flags & IFF_PROMISC),
4283                                 (old_flags & IFF_PROMISC),
4284                                 audit_get_loginuid(current),
4285                                 uid, gid,
4286                                 audit_get_sessionid(current));
4287                 }
4288
4289                 dev_change_rx_flags(dev, IFF_PROMISC);
4290         }
4291         return 0;
4292 }
4293
4294 /**
4295  *      dev_set_promiscuity     - update promiscuity count on a device
4296  *      @dev: device
4297  *      @inc: modifier
4298  *
4299  *      Add or remove promiscuity from a device. While the count in the device
4300  *      remains above zero the interface remains promiscuous. Once it hits zero
4301  *      the device reverts back to normal filtering operation. A negative inc
4302  *      value is used to drop promiscuity on the device.
4303  *      Return 0 if successful or a negative errno code on error.
4304  */
4305 int dev_set_promiscuity(struct net_device *dev, int inc)
4306 {
4307         unsigned short old_flags = dev->flags;
4308         int err;
4309
4310         err = __dev_set_promiscuity(dev, inc);
4311         if (err < 0)
4312                 return err;
4313         if (dev->flags != old_flags)
4314                 dev_set_rx_mode(dev);
4315         return err;
4316 }
4317 EXPORT_SYMBOL(dev_set_promiscuity);
4318
4319 /**
4320  *      dev_set_allmulti        - update allmulti count on a device
4321  *      @dev: device
4322  *      @inc: modifier
4323  *
4324  *      Add or remove reception of all multicast frames to a device. While the
4325  *      count in the device remains above zero the interface remains listening
4326  *      to all interfaces. Once it hits zero the device reverts back to normal
4327  *      filtering operation. A negative @inc value is used to drop the counter
4328  *      when releasing a resource needing all multicasts.
4329  *      Return 0 if successful or a negative errno code on error.
4330  */
4331
4332 int dev_set_allmulti(struct net_device *dev, int inc)
4333 {
4334         unsigned short old_flags = dev->flags;
4335
4336         ASSERT_RTNL();
4337
4338         dev->flags |= IFF_ALLMULTI;
4339         dev->allmulti += inc;
4340         if (dev->allmulti == 0) {
4341                 /*
4342                  * Avoid overflow.
4343                  * If inc causes overflow, untouch allmulti and return error.
4344                  */
4345                 if (inc < 0)
4346                         dev->flags &= ~IFF_ALLMULTI;
4347                 else {
4348                         dev->allmulti -= inc;
4349                         printk(KERN_WARNING "%s: allmulti touches roof, "
4350                                 "set allmulti failed, allmulti feature of "
4351                                 "device might be broken.\n", dev->name);
4352                         return -EOVERFLOW;
4353                 }
4354         }
4355         if (dev->flags ^ old_flags) {
4356                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4357                 dev_set_rx_mode(dev);
4358         }
4359         return 0;
4360 }
4361 EXPORT_SYMBOL(dev_set_allmulti);
4362
4363 /*
4364  *      Upload unicast and multicast address lists to device and
4365  *      configure RX filtering. When the device doesn't support unicast
4366  *      filtering it is put in promiscuous mode while unicast addresses
4367  *      are present.
4368  */
4369 void __dev_set_rx_mode(struct net_device *dev)
4370 {
4371         const struct net_device_ops *ops = dev->netdev_ops;
4372
4373         /* dev_open will call this function so the list will stay sane. */
4374         if (!(dev->flags&IFF_UP))
4375                 return;
4376
4377         if (!netif_device_present(dev))
4378                 return;
4379
4380         if (ops->ndo_set_rx_mode)
4381                 ops->ndo_set_rx_mode(dev);
4382         else {
4383                 /* Unicast addresses changes may only happen under the rtnl,
4384                  * therefore calling __dev_set_promiscuity here is safe.
4385                  */
4386                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4387                         __dev_set_promiscuity(dev, 1);
4388                         dev->uc_promisc = 1;
4389                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4390                         __dev_set_promiscuity(dev, -1);
4391                         dev->uc_promisc = 0;
4392                 }
4393
4394                 if (ops->ndo_set_multicast_list)
4395                         ops->ndo_set_multicast_list(dev);
4396         }
4397 }
4398
/*
 *	Locked wrapper around __dev_set_rx_mode(): the call is made between
 *	netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4405
4406 /**
4407  *      dev_get_flags - get flags reported to userspace
4408  *      @dev: device
4409  *
4410  *      Get the combination of flag bits exported through APIs to userspace.
4411  */
4412 unsigned dev_get_flags(const struct net_device *dev)
4413 {
4414         unsigned flags;
4415
4416         flags = (dev->flags & ~(IFF_PROMISC |
4417                                 IFF_ALLMULTI |
4418                                 IFF_RUNNING |
4419                                 IFF_LOWER_UP |
4420                                 IFF_DORMANT)) |
4421                 (dev->gflags & (IFF_PROMISC |
4422                                 IFF_ALLMULTI));
4423
4424         if (netif_running(dev)) {
4425                 if (netif_oper_up(dev))
4426                         flags |= IFF_RUNNING;
4427                 if (netif_carrier_ok(dev))
4428                         flags |= IFF_LOWER_UP;
4429                 if (netif_dormant(dev))
4430                         flags |= IFF_DORMANT;
4431         }
4432
4433         return flags;
4434 }
4435 EXPORT_SYMBOL(dev_get_flags);
4436
4437 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4438 {
4439         int old_flags = dev->flags;
4440         int ret;
4441
4442         ASSERT_RTNL();
4443
4444         /*
4445          *      Set the flags on our device.
4446          */
4447
4448         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4449                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4450                                IFF_AUTOMEDIA)) |
4451                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4452                                     IFF_ALLMULTI));
4453
4454         /*
4455          *      Load in the correct multicast list now the flags have changed.
4456          */
4457
4458         if ((old_flags ^ flags) & IFF_MULTICAST)
4459                 dev_change_rx_flags(dev, IFF_MULTICAST);
4460
4461         dev_set_rx_mode(dev);
4462
4463         /*
4464          *      Have we downed the interface. We handle IFF_UP ourselves
4465          *      according to user attempts to set it, rather than blindly
4466          *      setting it.
4467          */
4468
4469         ret = 0;
4470         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4471                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4472
4473                 if (!ret)
4474                         dev_set_rx_mode(dev);
4475         }
4476
4477         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4478                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4479
4480                 dev->gflags ^= IFF_PROMISC;
4481                 dev_set_promiscuity(dev, inc);
4482         }
4483
4484         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4485            is important. Some (broken) drivers set IFF_PROMISC, when
4486            IFF_ALLMULTI is requested not asking us and not reporting.
4487          */
4488         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4489                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4490
4491                 dev->gflags ^= IFF_ALLMULTI;
4492                 dev_set_allmulti(dev, inc);
4493         }
4494
4495         return ret;
4496 }
4497
4498 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4499 {
4500         unsigned int changes = dev->flags ^ old_flags;
4501
4502         if (changes & IFF_UP) {
4503                 if (dev->flags & IFF_UP)
4504                         call_netdevice_notifiers(NETDEV_UP, dev);
4505                 else
4506                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4507         }
4508
4509         if (dev->flags & IFF_UP &&
4510             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4511                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4512 }
4513
4514 /**
4515  *      dev_change_flags - change device settings
4516  *      @dev: device
4517  *      @flags: device state flags
4518  *
4519  *      Change settings on device based state flags. The flags are
4520  *      in the userspace exported format.
4521  */
4522 int dev_change_flags(struct net_device *dev, unsigned flags)
4523 {
4524         int ret, changes;
4525         int old_flags = dev->flags;
4526
4527         ret = __dev_change_flags(dev, flags);
4528         if (ret < 0)
4529                 return ret;
4530
4531         changes = old_flags ^ dev->flags;
4532         if (changes)
4533                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4534
4535         __dev_notify_flags(dev, old_flags);
4536         return ret;
4537 }
4538 EXPORT_SYMBOL(dev_change_flags);
4539
4540 /**
4541  *      dev_set_mtu - Change maximum transfer unit
4542  *      @dev: device
4543  *      @new_mtu: new transfer unit
4544  *
4545  *      Change the maximum transfer size of the network device.
4546  */
4547 int dev_set_mtu(struct net_device *dev, int new_mtu)
4548 {
4549         const struct net_device_ops *ops = dev->netdev_ops;
4550         int err;
4551
4552         if (new_mtu == dev->mtu)
4553                 return 0;
4554
4555         /*      MTU must be positive.    */
4556         if (new_mtu < 0)
4557                 return -EINVAL;
4558
4559         if (!netif_device_present(dev))
4560                 return -ENODEV;
4561
4562         err = 0;
4563         if (ops->ndo_change_mtu)
4564                 err = ops->ndo_change_mtu(dev, new_mtu);
4565         else
4566                 dev->mtu = new_mtu;
4567
4568         if (!err && dev->flags & IFF_UP)
4569                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4570         return err;
4571 }
4572 EXPORT_SYMBOL(dev_set_mtu);
4573
4574 /**
4575  *      dev_set_mac_address - Change Media Access Control Address
4576  *      @dev: device
4577  *      @sa: new address
4578  *
4579  *      Change the hardware (MAC) address of the device
4580  */
4581 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4582 {
4583         const struct net_device_ops *ops = dev->netdev_ops;
4584         int err;
4585
4586         if (!ops->ndo_set_mac_address)
4587                 return -EOPNOTSUPP;
4588         if (sa->sa_family != dev->type)
4589                 return -EINVAL;
4590         if (!netif_device_present(dev))
4591                 return -ENODEV;
4592         err = ops->ndo_set_mac_address(dev, sa);
4593         if (!err)
4594                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4595         return err;
4596 }
4597 EXPORT_SYMBOL(dev_set_mac_address);
4598
4599 /*
4600  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4601  */
4602 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4603 {
4604         int err;
4605         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4606
4607         if (!dev)
4608                 return -ENODEV;
4609
4610         switch (cmd) {
4611         case SIOCGIFFLAGS:      /* Get interface flags */
4612                 ifr->ifr_flags = (short) dev_get_flags(dev);
4613                 return 0;
4614
4615         case SIOCGIFMETRIC:     /* Get the metric on the interface
4616                                    (currently unused) */
4617                 ifr->ifr_metric = 0;
4618                 return 0;
4619
4620         case SIOCGIFMTU:        /* Get the MTU of a device */
4621                 ifr->ifr_mtu = dev->mtu;
4622                 return 0;
4623
4624         case SIOCGIFHWADDR:
4625                 if (!dev->addr_len)
4626                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4627                 else
4628                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4629                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4630                 ifr->ifr_hwaddr.sa_family = dev->type;
4631                 return 0;
4632
4633         case SIOCGIFSLAVE:
4634                 err = -EINVAL;
4635                 break;
4636
4637         case SIOCGIFMAP:
4638                 ifr->ifr_map.mem_start = dev->mem_start;
4639                 ifr->ifr_map.mem_end   = dev->mem_end;
4640                 ifr->ifr_map.base_addr = dev->base_addr;
4641                 ifr->ifr_map.irq       = dev->irq;
4642                 ifr->ifr_map.dma       = dev->dma;
4643                 ifr->ifr_map.port      = dev->if_port;
4644                 return 0;
4645
4646         case SIOCGIFINDEX:
4647                 ifr->ifr_ifindex = dev->ifindex;
4648                 return 0;
4649
4650         case SIOCGIFTXQLEN:
4651                 ifr->ifr_qlen = dev->tx_queue_len;
4652                 return 0;
4653
4654         default:
4655                 /* dev_ioctl() should ensure this case
4656                  * is never reached
4657                  */
4658                 WARN_ON(1);
4659                 err = -EINVAL;
4660                 break;
4661
4662         }
4663         return err;
4664 }
4665
4666 /*
4667  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4668  */
4669 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4670 {
4671         int err;
4672         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4673         const struct net_device_ops *ops;
4674
4675         if (!dev)
4676                 return -ENODEV;
4677
4678         ops = dev->netdev_ops;
4679
4680         switch (cmd) {
4681         case SIOCSIFFLAGS:      /* Set interface flags */
4682                 return dev_change_flags(dev, ifr->ifr_flags);
4683
4684         case SIOCSIFMETRIC:     /* Set the metric on the interface
4685                                    (currently unused) */
4686                 return -EOPNOTSUPP;
4687
4688         case SIOCSIFMTU:        /* Set the MTU of a device */
4689                 return dev_set_mtu(dev, ifr->ifr_mtu);
4690
4691         case SIOCSIFHWADDR:
4692                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4693
4694         case SIOCSIFHWBROADCAST:
4695                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4696                         return -EINVAL;
4697                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4698                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4699                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4700                 return 0;
4701
4702         case SIOCSIFMAP:
4703                 if (ops->ndo_set_config) {
4704                         if (!netif_device_present(dev))
4705                                 return -ENODEV;
4706                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4707                 }
4708                 return -EOPNOTSUPP;
4709
4710         case SIOCADDMULTI:
4711                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4712                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4713                         return -EINVAL;
4714                 if (!netif_device_present(dev))
4715                         return -ENODEV;
4716                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4717
4718         case SIOCDELMULTI:
4719                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4720                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4721                         return -EINVAL;
4722                 if (!netif_device_present(dev))
4723                         return -ENODEV;
4724                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4725
4726         case SIOCSIFTXQLEN:
4727                 if (ifr->ifr_qlen < 0)
4728                         return -EINVAL;
4729                 dev->tx_queue_len = ifr->ifr_qlen;
4730                 return 0;
4731
4732         case SIOCSIFNAME:
4733                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4734                 return dev_change_name(dev, ifr->ifr_newname);
4735
4736         /*
4737          *      Unknown or private ioctl
4738          */
4739         default:
4740                 if ((cmd >= SIOCDEVPRIVATE &&
4741                     cmd <= SIOCDEVPRIVATE + 15) ||
4742                     cmd == SIOCBONDENSLAVE ||
4743                     cmd == SIOCBONDRELEASE ||
4744                     cmd == SIOCBONDSETHWADDR ||
4745                     cmd == SIOCBONDSLAVEINFOQUERY ||
4746                     cmd == SIOCBONDINFOQUERY ||
4747                     cmd == SIOCBONDCHANGEACTIVE ||
4748                     cmd == SIOCGMIIPHY ||
4749                     cmd == SIOCGMIIREG ||
4750                     cmd == SIOCSMIIREG ||
4751                     cmd == SIOCBRADDIF ||
4752                     cmd == SIOCBRDELIF ||
4753                     cmd == SIOCSHWTSTAMP ||
4754                     cmd == SIOCWANDEV) {
4755                         err = -EOPNOTSUPP;
4756                         if (ops->ndo_do_ioctl) {
4757                                 if (netif_device_present(dev))
4758                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4759                                 else
4760                                         err = -ENODEV;
4761                         }
4762                 } else
4763                         err = -EINVAL;
4764
4765         }
4766         return err;
4767 }
4768
4769 /*
4770  *      This function handles all "interface"-type I/O control requests. The actual
4771  *      'doing' part of this is dev_ifsioc above.
4772  */
4773
4774 /**
4775  *      dev_ioctl       -       network device ioctl
4776  *      @net: the applicable net namespace
4777  *      @cmd: command to issue
4778  *      @arg: pointer to a struct ifreq in user space
4779  *
4780  *      Issue ioctl functions to devices. This is normally called by the
4781  *      user space syscall interfaces but can sometimes be useful for
4782  *      other purposes. The return value is the return from the syscall if
4783  *      positive or a negative errno code on error.
4784  */
4785
4786 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4787 {
4788         struct ifreq ifr;
4789         int ret;
4790         char *colon;
4791
4792         /* One special case: SIOCGIFCONF takes ifconf argument
4793            and requires shared lock, because it sleeps writing
4794            to user space.
4795          */
4796
4797         if (cmd == SIOCGIFCONF) {
4798                 rtnl_lock();
4799                 ret = dev_ifconf(net, (char __user *) arg);
4800                 rtnl_unlock();
4801                 return ret;
4802         }
4803         if (cmd == SIOCGIFNAME)
4804                 return dev_ifname(net, (struct ifreq __user *)arg);
4805
4806         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4807                 return -EFAULT;
4808
4809         ifr.ifr_name[IFNAMSIZ-1] = 0;
4810
4811         colon = strchr(ifr.ifr_name, ':');
4812         if (colon)
4813                 *colon = 0;
4814
4815         /*
4816          *      See which interface the caller is talking about.
4817          */
4818
4819         switch (cmd) {
4820         /*
4821          *      These ioctl calls:
4822          *      - can be done by all.
4823          *      - atomic and do not require locking.
4824          *      - return a value
4825          */
4826         case SIOCGIFFLAGS:
4827         case SIOCGIFMETRIC:
4828         case SIOCGIFMTU:
4829         case SIOCGIFHWADDR:
4830         case SIOCGIFSLAVE:
4831         case SIOCGIFMAP:
4832         case SIOCGIFINDEX:
4833         case SIOCGIFTXQLEN:
4834                 dev_load(net, ifr.ifr_name);
4835                 rcu_read_lock();
4836                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4837                 rcu_read_unlock();
4838                 if (!ret) {
4839                         if (colon)
4840                                 *colon = ':';
4841                         if (copy_to_user(arg, &ifr,
4842                                          sizeof(struct ifreq)))
4843                                 ret = -EFAULT;
4844                 }
4845                 return ret;
4846
4847         case SIOCETHTOOL:
4848                 dev_load(net, ifr.ifr_name);
4849                 rtnl_lock();
4850                 ret = dev_ethtool(net, &ifr);
4851                 rtnl_unlock();
4852                 if (!ret) {
4853                         if (colon)
4854                                 *colon = ':';
4855                         if (copy_to_user(arg, &ifr,
4856                                          sizeof(struct ifreq)))
4857                                 ret = -EFAULT;
4858                 }
4859                 return ret;
4860
4861         /*
4862          *      These ioctl calls:
4863          *      - require superuser power.
4864          *      - require strict serialization.
4865          *      - return a value
4866          */
4867         case SIOCGMIIPHY:
4868         case SIOCGMIIREG:
4869         case SIOCSIFNAME:
4870                 if (!capable(CAP_NET_ADMIN))
4871                         return -EPERM;
4872                 dev_load(net, ifr.ifr_name);
4873                 rtnl_lock();
4874                 ret = dev_ifsioc(net, &ifr, cmd);
4875                 rtnl_unlock();
4876                 if (!ret) {
4877                         if (colon)
4878                                 *colon = ':';
4879                         if (copy_to_user(arg, &ifr,
4880                                          sizeof(struct ifreq)))
4881                                 ret = -EFAULT;
4882                 }
4883                 return ret;
4884
4885         /*
4886          *      These ioctl calls:
4887          *      - require superuser power.
4888          *      - require strict serialization.
4889          *      - do not return a value
4890          */
4891         case SIOCSIFFLAGS:
4892         case SIOCSIFMETRIC:
4893         case SIOCSIFMTU:
4894         case SIOCSIFMAP:
4895         case SIOCSIFHWADDR:
4896         case SIOCSIFSLAVE:
4897         case SIOCADDMULTI:
4898         case SIOCDELMULTI:
4899         case SIOCSIFHWBROADCAST:
4900         case SIOCSIFTXQLEN:
4901         case SIOCSMIIREG:
4902         case SIOCBONDENSLAVE:
4903         case SIOCBONDRELEASE:
4904         case SIOCBONDSETHWADDR:
4905         case SIOCBONDCHANGEACTIVE:
4906         case SIOCBRADDIF:
4907         case SIOCBRDELIF:
4908         case SIOCSHWTSTAMP:
4909                 if (!capable(CAP_NET_ADMIN))
4910                         return -EPERM;
4911                 /* fall through */
4912         case SIOCBONDSLAVEINFOQUERY:
4913         case SIOCBONDINFOQUERY:
4914                 dev_load(net, ifr.ifr_name);
4915                 rtnl_lock();
4916                 ret = dev_ifsioc(net, &ifr, cmd);
4917                 rtnl_unlock();
4918                 return ret;
4919
4920         case SIOCGIFMEM:
4921                 /* Get the per device memory space. We can add this but
4922                  * currently do not support it */
4923         case SIOCSIFMEM:
4924                 /* Set the per device memory buffer space.
4925                  * Not applicable in our case */
4926         case SIOCSIFLINK:
4927                 return -EINVAL;
4928
4929         /*
4930          *      Unknown or private ioctl.
4931          */
4932         default:
4933                 if (cmd == SIOCWANDEV ||
4934                     (cmd >= SIOCDEVPRIVATE &&
4935                      cmd <= SIOCDEVPRIVATE + 15)) {
4936                         dev_load(net, ifr.ifr_name);
4937                         rtnl_lock();
4938                         ret = dev_ifsioc(net, &ifr, cmd);
4939                         rtnl_unlock();
4940                         if (!ret && copy_to_user(arg, &ifr,
4941                                                  sizeof(struct ifreq)))
4942                                 ret = -EFAULT;
4943                         return ret;
4944                 }
4945                 /* Take care of Wireless Extensions */
4946                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4947                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4948                 return -EINVAL;
4949         }
4950 }
4951
4952
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates are taken from a counter that is shared by all
 *	namespaces and restarts at 1 whenever it would become non-positive;
 *	the first candidate not in use in @net is returned.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4971
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);

/* Append @dev to the tail of net_todo_list through its todo_list linkage. */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
}
4979
4980 static void rollback_registered_many(struct list_head *head)
4981 {
4982         struct net_device *dev, *tmp;
4983
4984         BUG_ON(dev_boot_phase);
4985         ASSERT_RTNL();
4986
4987         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4988                 /* Some devices call without registering
4989                  * for initialization unwind. Remove those
4990                  * devices and proceed with the remaining.
4991                  */
4992                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4993                         pr_debug("unregister_netdevice: device %s/%p never "
4994                                  "was registered\n", dev->name, dev);
4995
4996                         WARN_ON(1);
4997                         list_del(&dev->unreg_list);
4998                         continue;
4999                 }
5000
5001                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5002         }
5003
5004         /* If device is running, close it first. */
5005         dev_close_many(head);
5006
5007         list_for_each_entry(dev, head, unreg_list) {
5008                 /* And unlink it from device chain. */
5009                 unlist_netdevice(dev);
5010
5011                 dev->reg_state = NETREG_UNREGISTERING;
5012         }
5013
5014         synchronize_net();
5015
5016         list_for_each_entry(dev, head, unreg_list) {
5017                 /* Shutdown queueing discipline. */
5018                 dev_shutdown(dev);
5019
5020
5021                 /* Notify protocols, that we are about to destroy
5022                    this device. They should clean all the things.
5023                 */
5024                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5025
5026                 if (!dev->rtnl_link_ops ||
5027                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5028                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5029
5030                 /*
5031                  *      Flush the unicast and multicast chains
5032                  */
5033                 dev_uc_flush(dev);
5034                 dev_mc_flush(dev);
5035
5036                 if (dev->netdev_ops->ndo_uninit)
5037                         dev->netdev_ops->ndo_uninit(dev);
5038
5039                 /* Notifier chain MUST detach us from master device. */
5040                 WARN_ON(dev->master);
5041
5042                 /* Remove entries from kobject tree */
5043                 netdev_unregister_kobject(dev);
5044         }
5045
5046         /* Process any work delayed until the end of the batch */
5047         dev = list_first_entry(head, struct net_device, unreg_list);
5048         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5049
5050         rcu_barrier();
5051
5052         list_for_each_entry(dev, head, unreg_list)
5053                 dev_put(dev);
5054 }
5055
/*
 *	Single-device convenience wrapper: puts @dev on a temporary on-stack
 *	list through its unreg_list linkage and hands that list to
 *	rollback_registered_many().
 */
static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
}
5063
5064 unsigned long netdev_fix_features(unsigned long features, const char *name)
5065 {
5066         /* Fix illegal SG+CSUM combinations. */
5067         if ((features & NETIF_F_SG) &&
5068             !(features & NETIF_F_ALL_CSUM)) {
5069                 if (name)
5070                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5071                                "checksum feature.\n", name);
5072                 features &= ~NETIF_F_SG;
5073         }
5074
5075         /* TSO requires that SG is present as well. */
5076         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5077                 if (name)
5078                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5079                                "SG feature.\n", name);
5080                 features &= ~NETIF_F_TSO;
5081         }
5082
5083         if (features & NETIF_F_UFO) {
5084                 /* maybe split UFO into V4 and V6? */
5085                 if (!((features & NETIF_F_GEN_CSUM) ||
5086                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5087                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5088                         if (name)
5089                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5090                                        "since no checksum offload features.\n",
5091                                        name);
5092                         features &= ~NETIF_F_UFO;
5093                 }
5094
5095                 if (!(features & NETIF_F_SG)) {
5096                         if (name)
5097                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5098                                        "since no NETIF_F_SG feature.\n", name);
5099                         features &= ~NETIF_F_UFO;
5100                 }
5101         }
5102
5103         return features;
5104 }
5105 EXPORT_SYMBOL(netdev_fix_features);
5106
5107 /**
5108  *      netif_stacked_transfer_operstate -      transfer operstate
5109  *      @rootdev: the root or lower level device to transfer state from
5110  *      @dev: the device to transfer operstate to
5111  *
5112  *      Transfer operational state from root to device. This is normally
5113  *      called when a stacking relationship exists between the root
5114  *      device and the device(a leaf device).
5115  */
5116 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5117                                         struct net_device *dev)
5118 {
5119         if (rootdev->operstate == IF_OPER_DORMANT)
5120                 netif_dormant_on(dev);
5121         else
5122                 netif_dormant_off(dev);
5123
5124         if (netif_carrier_ok(rootdev)) {
5125                 if (!netif_carrier_ok(dev))
5126                         netif_carrier_on(dev);
5127         } else {
5128                 if (netif_carrier_ok(dev))
5129                         netif_carrier_off(dev);
5130         }
5131 }
5132 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5133
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues receive queues,
 *	store it in dev->_rx and point every queue back at @dev.
 *	Returns 0 or -ENOMEM.  A queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}

	dev->_rx = queues;
	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5154
5155 static void netdev_init_one_queue(struct net_device *dev,
5156                                   struct netdev_queue *queue, void *_unused)
5157 {
5158         /* Initialize queue lock */
5159         spin_lock_init(&queue->_xmit_lock);
5160         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5161         queue->xmit_lock_owner = -1;
5162         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5163         queue->dev = dev;
5164 }
5165
5166 static int netif_alloc_netdev_queues(struct net_device *dev)
5167 {
5168         unsigned int count = dev->num_tx_queues;
5169         struct netdev_queue *tx;
5170
5171         BUG_ON(count < 1);
5172
5173         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5174         if (!tx) {
5175                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5176                        count);
5177                 return -ENOMEM;
5178         }
5179         dev->_tx = tx;
5180
5181         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5182         spin_lock_init(&dev->tx_global_lock);
5183
5184         return 0;
5185 }
5186
5187 /**
5188  *      register_netdevice      - register a network device
5189  *      @dev: device to register
5190  *
5191  *      Take a completed network device structure and add it to the kernel
5192  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5193  *      chain. 0 is returned on success. A negative errno code is returned
5194  *      on a failure to set up the device, or if the name is a duplicate.
5195  *
5196  *      Callers must hold the rtnl semaphore. You may want
5197  *      register_netdev() instead of this.
5198  *
5199  *      BUGS:
5200  *      The locking appears insufficient to guarantee two parallel registers
5201  *      will not get the same name.
5202  */
5203
5204 int register_netdevice(struct net_device *dev)
5205 {
5206         int ret;
5207         struct net *net = dev_net(dev);
5208
5209         BUG_ON(dev_boot_phase);
5210         ASSERT_RTNL();
5211
5212         might_sleep();
5213
5214         /* When net_device's are persistent, this will be fatal. */
5215         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5216         BUG_ON(!net);
5217
5218         spin_lock_init(&dev->addr_list_lock);
5219         netdev_set_addr_lockdep_class(dev);
5220
5221         dev->iflink = -1;
5222
5223         /* Init, if this function is available */
5224         if (dev->netdev_ops->ndo_init) {
5225                 ret = dev->netdev_ops->ndo_init(dev);
5226                 if (ret) {
5227                         if (ret > 0)
5228                                 ret = -EIO;
5229                         goto out;
5230                 }
5231         }
5232
5233         ret = dev_get_valid_name(dev, dev->name, 0);
5234         if (ret)
5235                 goto err_uninit;
5236
5237         dev->ifindex = dev_new_index(net);
5238         if (dev->iflink == -1)
5239                 dev->iflink = dev->ifindex;
5240
5241         /* Fix illegal checksum combinations */
5242         if ((dev->features & NETIF_F_HW_CSUM) &&
5243             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5244                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5245                        dev->name);
5246                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5247         }
5248
5249         if ((dev->features & NETIF_F_NO_CSUM) &&
5250             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5251                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5252                        dev->name);
5253                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5254         }
5255
5256         dev->features = netdev_fix_features(dev->features, dev->name);
5257
5258         /* Enable software GSO if SG is supported. */
5259         if (dev->features & NETIF_F_SG)
5260                 dev->features |= NETIF_F_GSO;
5261
5262         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5263          * vlan_dev_init() will do the dev->features check, so these features
5264          * are enabled only if supported by underlying device.
5265          */
5266         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5267
5268         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5269         ret = notifier_to_errno(ret);
5270         if (ret)
5271                 goto err_uninit;
5272
5273         ret = netdev_register_kobject(dev);
5274         if (ret)
5275                 goto err_uninit;
5276         dev->reg_state = NETREG_REGISTERED;
5277
5278         /*
5279          *      Default initial state at registry is that the
5280          *      device is present.
5281          */
5282
5283         set_bit(__LINK_STATE_PRESENT, &dev->state);
5284
5285         dev_init_scheduler(dev);
5286         dev_hold(dev);
5287         list_netdevice(dev);
5288
5289         /* Notify protocols, that a new device appeared. */
5290         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5291         ret = notifier_to_errno(ret);
5292         if (ret) {
5293                 rollback_registered(dev);
5294                 dev->reg_state = NETREG_UNREGISTERED;
5295         }
5296         /*
5297          *      Prevent userspace races by waiting until the network
5298          *      device is fully setup before sending notifications.
5299          */
5300         if (!dev->rtnl_link_ops ||
5301             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5302                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5303
5304 out:
5305         return ret;
5306
5307 err_uninit:
5308         if (dev->netdev_ops->ndo_uninit)
5309                 dev->netdev_ops->ndo_uninit(dev);
5310         goto out;
5311 }
5312 EXPORT_SYMBOL(register_netdevice);
5313
5314 /**
5315  *      init_dummy_netdev       - init a dummy network device for NAPI
5316  *      @dev: device to init
5317  *
5318  *      This takes a network device structure and initialize the minimum
5319  *      amount of fields so it can be used to schedule NAPI polls without
5320  *      registering a full blown interface. This is to be used by drivers
5321  *      that need to tie several hardware interfaces to a single NAPI
5322  *      poll scheduler due to HW limitations.
5323  */
5324 int init_dummy_netdev(struct net_device *dev)
5325 {
5326         /* Clear everything. Note we don't initialize spinlocks
5327          * are they aren't supposed to be taken by any of the
5328          * NAPI code and this dummy netdev is supposed to be
5329          * only ever used for NAPI polls
5330          */
5331         memset(dev, 0, sizeof(struct net_device));
5332
5333         /* make sure we BUG if trying to hit standard
5334          * register/unregister code path
5335          */
5336         dev->reg_state = NETREG_DUMMY;
5337
5338         /* NAPI wants this */
5339         INIT_LIST_HEAD(&dev->napi_list);
5340
5341         /* a dummy interface is started by default */
5342         set_bit(__LINK_STATE_PRESENT, &dev->state);
5343         set_bit(__LINK_STATE_START, &dev->state);
5344
5345         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5346          * because users of this 'device' dont need to change
5347          * its refcount.
5348          */
5349
5350         return 0;
5351 }
5352 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5353
5354
5355 /**
5356  *      register_netdev - register a network device
5357  *      @dev: device to register
5358  *
5359  *      Take a completed network device structure and add it to the kernel
5360  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5361  *      chain. 0 is returned on success. A negative errno code is returned
5362  *      on a failure to set up the device, or if the name is a duplicate.
5363  *
5364  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5365  *      and expands the device name if you passed a format string to
5366  *      alloc_netdev.
5367  */
5368 int register_netdev(struct net_device *dev)
5369 {
5370         int err;
5371
5372         rtnl_lock();
5373
5374         /*
5375          * If the name is a format string the caller wants us to do a
5376          * name allocation.
5377          */
5378         if (strchr(dev->name, '%')) {
5379                 err = dev_alloc_name(dev, dev->name);
5380                 if (err < 0)
5381                         goto out;
5382         }
5383
5384         err = register_netdevice(dev);
5385 out:
5386         rtnl_unlock();
5387         return err;
5388 }
5389 EXPORT_SYMBOL(register_netdev);
5390
5391 int netdev_refcnt_read(const struct net_device *dev)
5392 {
5393         int i, refcnt = 0;
5394
5395         for_each_possible_cpu(i)
5396                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5397         return refcnt;
5398 }
5399 EXPORT_SYMBOL(netdev_refcnt_read);
5400
5401 /*
5402  * netdev_wait_allrefs - wait until all references are gone.
5403  *
5404  * This is called when unregistering network devices.
5405  *
5406  * Any protocol or device that holds a reference should register
5407  * for netdevice notification, and cleanup and put back the
5408  * reference if they receive an UNREGISTER event.
5409  * We can get stuck here if buggy protocols don't correctly
5410  * call dev_put.
5411  */
5412 static void netdev_wait_allrefs(struct net_device *dev)
5413 {
5414         unsigned long rebroadcast_time, warning_time;
5415         int refcnt;
5416
5417         linkwatch_forget_dev(dev);
5418
5419         rebroadcast_time = warning_time = jiffies;
5420         refcnt = netdev_refcnt_read(dev);
5421
5422         while (refcnt != 0) {
5423                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5424                         rtnl_lock();
5425
5426                         /* Rebroadcast unregister notification */
5427                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5428                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5429                          * should have already handle it the first time */
5430
5431                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5432                                      &dev->state)) {
5433                                 /* We must not have linkwatch events
5434                                  * pending on unregister. If this
5435                                  * happens, we simply run the queue
5436                                  * unscheduled, resulting in a noop
5437                                  * for this device.
5438                                  */
5439                                 linkwatch_run_queue();
5440                         }
5441
5442                         __rtnl_unlock();
5443
5444                         rebroadcast_time = jiffies;
5445                 }
5446
5447                 msleep(250);
5448
5449                 refcnt = netdev_refcnt_read(dev);
5450
5451                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5452                         printk(KERN_EMERG "unregister_netdevice: "
5453                                "waiting for %s to become free. Usage "
5454                                "count = %d\n",
5455                                dev->name, refcnt);
5456                         warning_time = jiffies;
5457                 }
5458         }
5459 }
5460
5461 /* The sequence is:
5462  *
5463  *      rtnl_lock();
5464  *      ...
5465  *      register_netdevice(x1);
5466  *      register_netdevice(x2);
5467  *      ...
5468  *      unregister_netdevice(y1);
5469  *      unregister_netdevice(y2);
5470  *      ...
5471  *      rtnl_unlock();
5472  *      free_netdev(y1);
5473  *      free_netdev(y2);
5474  *
5475  * We are invoked by rtnl_unlock().
5476  * This allows us to deal with problems:
5477  * 1) We can delete sysfs objects which invoke hotplug
5478  *    without deadlocking with linkwatch via keventd.
5479  * 2) Since we run with the RTNL semaphore not held, we can sleep
5480  *    safely in order to wait for the netdev refcnt to drop to zero.
5481  *
5482  * We must not return until all unregister events added during
5483  * the interval the lock was held have been completed.
5484  */
5485 void netdev_run_todo(void)
5486 {
5487         struct list_head list;
5488
5489         /* Snapshot list, allow later requests */
5490         list_replace_init(&net_todo_list, &list);
5491
5492         __rtnl_unlock();
5493
5494         while (!list_empty(&list)) {
5495                 struct net_device *dev
5496                         = list_first_entry(&list, struct net_device, todo_list);
5497                 list_del(&dev->todo_list);
5498
5499                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5500                         printk(KERN_ERR "network todo '%s' but state %d\n",
5501                                dev->name, dev->reg_state);
5502                         dump_stack();
5503                         continue;
5504                 }
5505
5506                 dev->reg_state = NETREG_UNREGISTERED;
5507
5508                 on_each_cpu(flush_backlog, dev, 1);
5509
5510                 netdev_wait_allrefs(dev);
5511
5512                 /* paranoia */
5513                 BUG_ON(netdev_refcnt_read(dev));
5514                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5515                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5516                 WARN_ON(dev->dn_ptr);
5517
5518                 if (dev->destructor)
5519                         dev->destructor(dev);
5520
5521                 /* Free network device */
5522                 kobject_put(&dev->dev.kobj);
5523         }
5524 }
5525
5526 /**
5527  *      dev_txq_stats_fold - fold tx_queues stats
5528  *      @dev: device to get statistics from
5529  *      @stats: struct rtnl_link_stats64 to hold results
5530  */
5531 void dev_txq_stats_fold(const struct net_device *dev,
5532                         struct rtnl_link_stats64 *stats)
5533 {
5534         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5535         unsigned int i;
5536         struct netdev_queue *txq;
5537
5538         for (i = 0; i < dev->num_tx_queues; i++) {
5539                 txq = netdev_get_tx_queue(dev, i);
5540                 spin_lock_bh(&txq->_xmit_lock);
5541                 tx_bytes   += txq->tx_bytes;
5542                 tx_packets += txq->tx_packets;
5543                 tx_dropped += txq->tx_dropped;
5544                 spin_unlock_bh(&txq->_xmit_lock);
5545         }
5546         if (tx_bytes || tx_packets || tx_dropped) {
5547                 stats->tx_bytes   = tx_bytes;
5548                 stats->tx_packets = tx_packets;
5549                 stats->tx_dropped = tx_dropped;
5550         }
5551 }
5552 EXPORT_SYMBOL(dev_txq_stats_fold);
5553
5554 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5555  * fields in the same order, with only the type differing.
5556  */
5557 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5558                                     const struct net_device_stats *netdev_stats)
5559 {
5560 #if BITS_PER_LONG == 64
5561         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5562         memcpy(stats64, netdev_stats, sizeof(*stats64));
5563 #else
5564         size_t i, n = sizeof(*stats64) / sizeof(u64);
5565         const unsigned long *src = (const unsigned long *)netdev_stats;
5566         u64 *dst = (u64 *)stats64;
5567
5568         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5569                      sizeof(*stats64) / sizeof(u64));
5570         for (i = 0; i < n; i++)
5571                 dst[i] = src[i];
5572 #endif
5573 }
5574
5575 /**
5576  *      dev_get_stats   - get network device statistics
5577  *      @dev: device to get statistics from
5578  *      @storage: place to store stats
5579  *
5580  *      Get network statistics from device. Return @storage.
5581  *      The device driver may provide its own method by setting
5582  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5583  *      otherwise the internal statistics structure is used.
5584  */
5585 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5586                                         struct rtnl_link_stats64 *storage)
5587 {
5588         const struct net_device_ops *ops = dev->netdev_ops;
5589
5590         if (ops->ndo_get_stats64) {
5591                 memset(storage, 0, sizeof(*storage));
5592                 ops->ndo_get_stats64(dev, storage);
5593         } else if (ops->ndo_get_stats) {
5594                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5595         } else {
5596                 netdev_stats_to_stats64(storage, &dev->stats);
5597                 dev_txq_stats_fold(dev, storage);
5598         }
5599         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5600         return storage;
5601 }
5602 EXPORT_SYMBOL(dev_get_stats);
5603
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        /* Create the ingress queue only if the device has none yet */
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish the queue once it is fully set up */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        /* Existing or new queue, or NULL if there is none or allocation failed */
        return queue;
}
5621
5622 /**
5623  *      alloc_netdev_mqs - allocate network device
5624  *      @sizeof_priv:   size of private data to allocate space for
5625  *      @name:          device name format string
5626  *      @setup:         callback to initialize device
5627  *      @txqs:          the number of TX subqueues to allocate
5628  *      @rxqs:          the number of RX subqueues to allocate
5629  *
5630  *      Allocates a struct net_device with private data area for driver use
5631  *      and performs basic initialization.  Also allocates subquue structs
5632  *      for each queue on the device.
5633  */
5634 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5635                 void (*setup)(struct net_device *),
5636                 unsigned int txqs, unsigned int rxqs)
5637 {
5638         struct net_device *dev;
5639         size_t alloc_size;
5640         struct net_device *p;
5641
5642         BUG_ON(strlen(name) >= sizeof(dev->name));
5643
5644         if (txqs < 1) {
5645                 pr_err("alloc_netdev: Unable to allocate device "
5646                        "with zero queues.\n");
5647                 return NULL;
5648         }
5649
5650 #ifdef CONFIG_RPS
5651         if (rxqs < 1) {
5652                 pr_err("alloc_netdev: Unable to allocate device "
5653                        "with zero RX queues.\n");
5654                 return NULL;
5655         }
5656 #endif
5657
5658         alloc_size = sizeof(struct net_device);
5659         if (sizeof_priv) {
5660                 /* ensure 32-byte alignment of private area */
5661                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5662                 alloc_size += sizeof_priv;
5663         }
5664         /* ensure 32-byte alignment of whole construct */
5665         alloc_size += NETDEV_ALIGN - 1;
5666
5667         p = kzalloc(alloc_size, GFP_KERNEL);
5668         if (!p) {
5669                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5670                 return NULL;
5671         }
5672
5673         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5674         dev->padded = (char *)dev - (char *)p;
5675
5676         dev->pcpu_refcnt = alloc_percpu(int);
5677         if (!dev->pcpu_refcnt)
5678                 goto free_p;
5679
5680         if (dev_addr_init(dev))
5681                 goto free_pcpu;
5682
5683         dev_mc_init(dev);
5684         dev_uc_init(dev);
5685
5686         dev_net_set(dev, &init_net);
5687
5688         dev->num_tx_queues = txqs;
5689         dev->real_num_tx_queues = txqs;
5690         if (netif_alloc_netdev_queues(dev))
5691                 goto free_pcpu;
5692
5693 #ifdef CONFIG_RPS
5694         dev->num_rx_queues = rxqs;
5695         dev->real_num_rx_queues = rxqs;
5696         if (netif_alloc_rx_queues(dev))
5697                 goto free_pcpu;
5698 #endif
5699
5700         dev->gso_max_size = GSO_MAX_SIZE;
5701
5702         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5703         dev->ethtool_ntuple_list.count = 0;
5704         INIT_LIST_HEAD(&dev->napi_list);
5705         INIT_LIST_HEAD(&dev->unreg_list);
5706         INIT_LIST_HEAD(&dev->link_watch_list);
5707         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5708         setup(dev);
5709         strcpy(dev->name, name);
5710         return dev;
5711
5712 free_pcpu:
5713         free_percpu(dev->pcpu_refcnt);
5714         kfree(dev->_tx);
5715 #ifdef CONFIG_RPS
5716         kfree(dev->_rx);
5717 #endif
5718
5719 free_p:
5720         kfree(p);
5721         return NULL;
5722 }
5723 EXPORT_SYMBOL(alloc_netdev_mqs);
5724
5725 /**
5726  *      free_netdev - free network device
5727  *      @dev: device
5728  *
5729  *      This function does the last stage of destroying an allocated device
5730  *      interface. The reference to the device object is released.
5731  *      If this is the last reference then it will be freed.
5732  */
5733 void free_netdev(struct net_device *dev)
5734 {
5735         struct napi_struct *p, *n;
5736
5737         release_net(dev_net(dev));
5738
5739         kfree(dev->_tx);
5740 #ifdef CONFIG_RPS
5741         kfree(dev->_rx);
5742 #endif
5743
5744         kfree(rcu_dereference_raw(dev->ingress_queue));
5745
5746         /* Flush device addresses */
5747         dev_addr_flush(dev);
5748
5749         /* Clear ethtool n-tuple list */
5750         ethtool_ntuple_flush(dev);
5751
5752         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5753                 netif_napi_del(p);
5754
5755         free_percpu(dev->pcpu_refcnt);
5756         dev->pcpu_refcnt = NULL;
5757
5758         /*  Compatibility with error handling in drivers */
5759         if (dev->reg_state == NETREG_UNINITIALIZED) {
5760                 kfree((char *)dev - dev->padded);
5761                 return;
5762         }
5763
5764         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5765         dev->reg_state = NETREG_RELEASED;
5766
5767         /* will free via device release */
5768         put_device(&dev->dev);
5769 }
5770 EXPORT_SYMBOL(free_netdev);
5771
5772 /**
5773  *      synchronize_net -  Synchronize with packet receive processing
5774  *
5775  *      Wait for packets currently being received to be done.
5776  *      Does not block later packets from starting.
5777  */
void synchronize_net(void)
{
        /* Annotate that this function may sleep */
        might_sleep();
        /* The wait itself is delegated to synchronize_rcu() */
        synchronize_rcu();
}
5783 EXPORT_SYMBOL(synchronize_net);
5784
5785 /**
5786  *      unregister_netdevice_queue - remove device from the kernel
5787  *      @dev: device
5788  *      @head: list
5789  *
5790  *      This function shuts down a device interface and removes it
5791  *      from the kernel tables.
5792  *      If head not NULL, device is queued to be unregistered later.
5793  *
5794  *      Callers must hold the rtnl semaphore.  You may want
5795  *      unregister_netdev() instead of this.
5796  */
5797
5798 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5799 {
5800         ASSERT_RTNL();
5801
5802         if (head) {
5803                 list_move_tail(&dev->unreg_list, head);
5804         } else {
5805                 rollback_registered(dev);
5806                 /* Finish processing unregister after unlock */
5807                 net_set_todo(dev);
5808         }
5809 }
5810 EXPORT_SYMBOL(unregister_netdevice_queue);
5811
5812 /**
5813  *      unregister_netdevice_many - unregister many devices
5814  *      @head: list of devices
5815  */
5816 void unregister_netdevice_many(struct list_head *head)
5817 {
5818         struct net_device *dev;
5819
5820         if (!list_empty(head)) {
5821                 rollback_registered_many(head);
5822                 list_for_each_entry(dev, head, unreg_list)
5823                         net_set_todo(dev);
5824         }
5825 }
5826 EXPORT_SYMBOL(unregister_netdevice_many);
5827
5828 /**
5829  *      unregister_netdev - remove device from the kernel
5830  *      @dev: device
5831  *
5832  *      This function shuts down a device interface and removes it
5833  *      from the kernel tables.
5834  *
5835  *      This is just a wrapper for unregister_netdevice that takes
5836  *      the rtnl semaphore.  In general you want to use this and not
5837  *      unregister_netdevice.
5838  */
void unregister_netdev(struct net_device *dev)
{
        /* unregister_netdevice() is called with the rtnl lock held */
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}
5845 EXPORT_SYMBOL(unregister_netdev);
5846
5847 /**
5848  *      dev_change_net_namespace - move device to different nethost namespace
5849  *      @dev: device
5850  *      @net: network namespace
5851  *      @pat: If not NULL name pattern to try if the current device name
5852  *            is already taken in the destination network namespace.
5853  *
5854  *      This function shuts down a device interface and moves it
5855  *      to a new network namespace. On success 0 is returned, on
5856  *      a failure a netagive errno code is returned.
5857  *
5858  *      Callers must hold the rtnl semaphore.
5859  */
5860
5861 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5862 {
5863         int err;
5864
5865         ASSERT_RTNL();
5866
5867         /* Don't allow namespace local devices to be moved. */
5868         err = -EINVAL;
5869         if (dev->features & NETIF_F_NETNS_LOCAL)
5870                 goto out;
5871
5872         /* Ensure the device has been registrered */
5873         err = -EINVAL;
5874         if (dev->reg_state != NETREG_REGISTERED)
5875                 goto out;
5876
5877         /* Get out if there is nothing todo */
5878         err = 0;
5879         if (net_eq(dev_net(dev), net))
5880                 goto out;
5881
5882         /* Pick the destination device name, and ensure
5883          * we can use it in the destination network namespace.
5884          */
5885         err = -EEXIST;
5886         if (__dev_get_by_name(net, dev->name)) {
5887                 /* We get here if we can't use the current device name */
5888                 if (!pat)
5889                         goto out;
5890                 if (dev_get_valid_name(dev, pat, 1))
5891                         goto out;
5892         }
5893
5894         /*
5895          * And now a mini version of register_netdevice unregister_netdevice.
5896          */
5897
5898         /* If device is running close it first. */
5899         dev_close(dev);
5900
5901         /* And unlink it from device chain */
5902         err = -ENODEV;
5903         unlist_netdevice(dev);
5904
5905         synchronize_net();
5906
5907         /* Shutdown queueing discipline. */
5908         dev_shutdown(dev);
5909
5910         /* Notify protocols, that we are about to destroy
5911            this device. They should clean all the things.
5912
5913            Note that dev->reg_state stays at NETREG_REGISTERED.
5914            This is wanted because this way 8021q and macvlan know
5915            the device is just moving and can keep their slaves up.
5916         */
5917         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5918         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5919
5920         /*
5921          *      Flush the unicast and multicast chains
5922          */
5923         dev_uc_flush(dev);
5924         dev_mc_flush(dev);
5925
5926         /* Actually switch the network namespace */
5927         dev_net_set(dev, net);
5928
5929         /* If there is an ifindex conflict assign a new one */
5930         if (__dev_get_by_index(net, dev->ifindex)) {
5931                 int iflink = (dev->iflink == dev->ifindex);
5932                 dev->ifindex = dev_new_index(net);
5933                 if (iflink)
5934                         dev->iflink = dev->ifindex;
5935         }
5936
5937         /* Fixup kobjects */
5938         err = device_rename(&dev->dev, dev->name);
5939         WARN_ON(err);
5940
5941         /* Add the device back in the hashes */
5942         list_netdevice(dev);
5943
5944         /* Notify protocols, that a new device appeared. */
5945         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5946
5947         /*
5948          *      Prevent userspace races by waiting until the network
5949          *      device is fully setup before sending notifications.
5950          */
5951         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5952
5953         synchronize_net();
5954         err = 0;
5955 out:
5956         return err;
5957 }
5958 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5959
5960 static int dev_cpu_callback(struct notifier_block *nfb,
5961                             unsigned long action,
5962                             void *ocpu)
5963 {
5964         struct sk_buff **list_skb;
5965         struct sk_buff *skb;
5966         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5967         struct softnet_data *sd, *oldsd;
5968
5969         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5970                 return NOTIFY_OK;
5971
5972         local_irq_disable();
5973         cpu = smp_processor_id();
5974         sd = &per_cpu(softnet_data, cpu);
5975         oldsd = &per_cpu(softnet_data, oldcpu);
5976
5977         /* Find end of our completion_queue. */
5978         list_skb = &sd->completion_queue;
5979         while (*list_skb)
5980                 list_skb = &(*list_skb)->next;
5981         /* Append completion queue from offline CPU. */
5982         *list_skb = oldsd->completion_queue;
5983         oldsd->completion_queue = NULL;
5984
5985         /* Append output queue from offline CPU. */
5986         if (oldsd->output_queue) {
5987                 *sd->output_queue_tailp = oldsd->output_queue;
5988                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5989                 oldsd->output_queue = NULL;
5990                 oldsd->output_queue_tailp = &oldsd->output_queue;
5991         }
5992
5993         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5994         local_irq_enable();
5995
5996         /* Process offline CPU's input_pkt_queue */
5997         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5998                 netif_rx(skb);
5999                 input_queue_head_incr(oldsd);
6000         }
6001         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6002                 netif_rx(skb);
6003                 input_queue_head_incr(oldsd);
6004         }
6005
6006         return NOTIFY_OK;
6007 }
6008
6009
6010 /**
6011  *      netdev_increment_features - increment feature set by one
6012  *      @all: current feature set
6013  *      @one: new feature set
6014  *      @mask: mask feature set
6015  *
6016  *      Computes a new feature set after adding a device with feature set
6017  *      @one to the master device with current feature set @all.  Will not
6018  *      enable anything that is off in @mask. Returns the new feature set.
6019  */
6020 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6021                                         unsigned long mask)
6022 {
6023         /* If device needs checksumming, downgrade to it. */
6024         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6025                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6026         else if (mask & NETIF_F_ALL_CSUM) {
6027                 /* If one device supports v4/v6 checksumming, set for all. */
6028                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6029                     !(all & NETIF_F_GEN_CSUM)) {
6030                         all &= ~NETIF_F_ALL_CSUM;
6031                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6032                 }
6033
6034                 /* If one device supports hw checksumming, set for all. */
6035                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6036                         all &= ~NETIF_F_ALL_CSUM;
6037                         all |= NETIF_F_HW_CSUM;
6038                 }
6039         }
6040
6041         one |= NETIF_F_ALL_CSUM;
6042
6043         one |= all & NETIF_F_ONE_FOR_ALL;
6044         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6045         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6046
6047         return all;
6048 }
6049 EXPORT_SYMBOL(netdev_increment_features);
6050
6051 static struct hlist_head *netdev_create_hash(void)
6052 {
6053         int i;
6054         struct hlist_head *hash;
6055
6056         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6057         if (hash != NULL)
6058                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6059                         INIT_HLIST_HEAD(&hash[i]);
6060
6061         return hash;
6062 }
6063
6064 /* Initialize per network namespace state */
6065 static int __net_init netdev_init(struct net *net)
6066 {
6067         INIT_LIST_HEAD(&net->dev_base_head);
6068
6069         net->dev_name_head = netdev_create_hash();
6070         if (net->dev_name_head == NULL)
6071                 goto err_name;
6072
6073         net->dev_index_head = netdev_create_hash();
6074         if (net->dev_index_head == NULL)
6075                 goto err_idx;
6076
6077         return 0;
6078
6079 err_idx:
6080         kfree(net->dev_name_head);
6081 err_name:
6082         return -ENOMEM;
6083 }
6084
6085 /**
6086  *      netdev_drivername - network driver for the device
6087  *      @dev: network device
6088  *      @buffer: buffer for resulting name
6089  *      @len: size of buffer
6090  *
6091  *      Determine network driver for device.
6092  */
6093 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6094 {
6095         const struct device_driver *driver;
6096         const struct device *parent;
6097
6098         if (len <= 0 || !buffer)
6099                 return buffer;
6100         buffer[0] = 0;
6101
6102         parent = dev->dev.parent;
6103
6104         if (!parent)
6105                 return buffer;
6106
6107         driver = parent->driver;
6108         if (driver && driver->name)
6109                 strlcpy(buffer, driver->name, len);
6110         return buffer;
6111 }
6112
6113 static int __netdev_printk(const char *level, const struct net_device *dev,
6114                            struct va_format *vaf)
6115 {
6116         int r;
6117
6118         if (dev && dev->dev.parent)
6119                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6120                                netdev_name(dev), vaf);
6121         else if (dev)
6122                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6123         else
6124                 r = printk("%s(NULL net_device): %pV", level, vaf);
6125
6126         return r;
6127 }
6128
6129 int netdev_printk(const char *level, const struct net_device *dev,
6130                   const char *format, ...)
6131 {
6132         struct va_format vaf;
6133         va_list args;
6134         int r;
6135
6136         va_start(args, format);
6137
6138         vaf.fmt = format;
6139         vaf.va = &args;
6140
6141         r = __netdev_printk(level, dev, &vaf);
6142         va_end(args);
6143
6144         return r;
6145 }
6146 EXPORT_SYMBOL(netdev_printk);
6147
6148 #define define_netdev_printk_level(func, level)                 \
6149 int func(const struct net_device *dev, const char *fmt, ...)    \
6150 {                                                               \
6151         int r;                                                  \
6152         struct va_format vaf;                                   \
6153         va_list args;                                           \
6154                                                                 \
6155         va_start(args, fmt);                                    \
6156                                                                 \
6157         vaf.fmt = fmt;                                          \
6158         vaf.va = &args;                                         \
6159                                                                 \
6160         r = __netdev_printk(level, dev, &vaf);                  \
6161         va_end(args);                                           \
6162                                                                 \
6163         return r;                                               \
6164 }                                                               \
6165 EXPORT_SYMBOL(func);
6166
6167 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6168 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6169 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6170 define_netdev_printk_level(netdev_err, KERN_ERR);
6171 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6172 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6173 define_netdev_printk_level(netdev_info, KERN_INFO);
6174
6175 static void __net_exit netdev_exit(struct net *net)
6176 {
6177         kfree(net->dev_name_head);
6178         kfree(net->dev_index_head);
6179 }
6180
6181 static struct pernet_operations __net_initdata netdev_net_ops = {
6182         .init = netdev_init,
6183         .exit = netdev_exit,
6184 };
6185
6186 static void __net_exit default_device_exit(struct net *net)
6187 {
6188         struct net_device *dev, *aux;
6189         /*
6190          * Push all migratable network devices back to the
6191          * initial network namespace
6192          */
6193         rtnl_lock();
6194         for_each_netdev_safe(net, dev, aux) {
6195                 int err;
6196                 char fb_name[IFNAMSIZ];
6197
6198                 /* Ignore unmoveable devices (i.e. loopback) */
6199                 if (dev->features & NETIF_F_NETNS_LOCAL)
6200                         continue;
6201
6202                 /* Leave virtual devices for the generic cleanup */
6203                 if (dev->rtnl_link_ops)
6204                         continue;
6205
6206                 /* Push remaing network devices to init_net */
6207                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6208                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6209                 if (err) {
6210                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6211                                 __func__, dev->name, err);
6212                         BUG();
6213                 }
6214         }
6215         rtnl_unlock();
6216 }
6217
6218 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6219 {
6220         /* At exit all network devices most be removed from a network
6221          * namespace.  Do this in the reverse order of registration.
6222          * Do this across as many network namespaces as possible to
6223          * improve batching efficiency.
6224          */
6225         struct net_device *dev;
6226         struct net *net;
6227         LIST_HEAD(dev_kill_list);
6228
6229         rtnl_lock();
6230         list_for_each_entry(net, net_list, exit_list) {
6231                 for_each_netdev_reverse(net, dev) {
6232                         if (dev->rtnl_link_ops)
6233                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6234                         else
6235                                 unregister_netdevice_queue(dev, &dev_kill_list);
6236                 }
6237         }
6238         unregister_netdevice_many(&dev_kill_list);
6239         rtnl_unlock();
6240 }
6241
6242 static struct pernet_operations __net_initdata default_device_ops = {
6243         .exit = default_device_exit,
6244         .exit_batch = default_device_exit_batch,
6245 };
6246
6247 /*
6248  *      Initialize the DEV module. At boot time this walks the device list and
6249  *      unhooks any devices that fail to initialise (normally hardware not
6250  *      present) and leaves us with a valid list of present and active devices.
6251  *
6252  */
6253
6254 /*
6255  *       This is called single threaded during boot, so no need
6256  *       to take the rtnl semaphore.
6257  */
6258 static int __init net_dev_init(void)
6259 {
6260         int i, rc = -ENOMEM;
6261
6262         BUG_ON(!dev_boot_phase);
6263
6264         if (dev_proc_init())
6265                 goto out;
6266
6267         if (netdev_kobject_init())
6268                 goto out;
6269
6270         INIT_LIST_HEAD(&ptype_all);
6271         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6272                 INIT_LIST_HEAD(&ptype_base[i]);
6273
6274         if (register_pernet_subsys(&netdev_net_ops))
6275                 goto out;
6276
6277         /*
6278          *      Initialise the packet receive queues.
6279          */
6280
6281         for_each_possible_cpu(i) {
6282                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6283
6284                 memset(sd, 0, sizeof(*sd));
6285                 skb_queue_head_init(&sd->input_pkt_queue);
6286                 skb_queue_head_init(&sd->process_queue);
6287                 sd->completion_queue = NULL;
6288                 INIT_LIST_HEAD(&sd->poll_list);
6289                 sd->output_queue = NULL;
6290                 sd->output_queue_tailp = &sd->output_queue;
6291 #ifdef CONFIG_RPS
6292                 sd->csd.func = rps_trigger_softirq;
6293                 sd->csd.info = sd;
6294                 sd->csd.flags = 0;
6295                 sd->cpu = i;
6296 #endif
6297
6298                 sd->backlog.poll = process_backlog;
6299                 sd->backlog.weight = weight_p;
6300                 sd->backlog.gro_list = NULL;
6301                 sd->backlog.gro_count = 0;
6302         }
6303
6304         dev_boot_phase = 0;
6305
6306         /* The loopback device is special if any other network devices
6307          * is present in a network namespace the loopback device must
6308          * be present. Since we now dynamically allocate and free the
6309          * loopback device ensure this invariant is maintained by
6310          * keeping the loopback device as the first device on the
6311          * list of network devices.  Ensuring the loopback devices
6312          * is the first device that appears and the last network device
6313          * that disappears.
6314          */
6315         if (register_pernet_device(&loopback_net_ops))
6316                 goto out;
6317
6318         if (register_pernet_device(&default_device_ops))
6319                 goto out;
6320
6321         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6322         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6323
6324         hotcpu_notifier(dev_cpu_callback, 0);
6325         dst_init();
6326         dev_mcast_init();
6327         rc = 0;
6328 out:
6329         return rc;
6330 }
6331
6332 subsys_initcall(net_dev_init);
6333
6334 static int __init initialize_hashrnd(void)
6335 {
6336         get_random_bytes(&hashrnd, sizeof(hashrnd));
6337         return 0;
6338 }
6339
6340 late_initcall_sync(initialize_hashrnd);
6341