net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137
 138 #include "net-sysfs.h"
 139
 140 /* Instead of increasing this, you should create a hash table. */
 141 #define MAX_GRO_SKBS 8
 142
 143 /* This should be increased if a protocol with a bigger head is added. */
 144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 145
 146 /*
 147  *      The list of packet types we will receive (as opposed to discard)
 148  *      and the routines to invoke.
 149  *
 150  *      Why 16. Because with 16 the only overlap we get on a hash of the
 151  *      low nibble of the protocol value is RARP/SNAP/X.25.
 152  *
 153  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 154  *             sure which should go first, but I bet it won't make much
 155  *             difference if we are running VLANs.  The good news is that
 156  *             this protocol won't be in the list unless compiled in, so
 157  *             the average user (w/out VLANs) will not be adversely affected.
 158  *             --BLG
 159  *
 160  *              0800    IP
 161  *              8100    802.1Q VLAN
 162  *              0001    802.3
 163  *              0002    AX.25
 164  *              0004    802.2
 165  *              8035    RARP
 166  *              0005    SNAP
 167  *              0805    X.25
 168  *              0806    ARP
 169  *              8137    IPX
 170  *              0009    Localtalk
 171  *              86DD    IPv6
 172  */
 173
 174 #define PTYPE_HASH_SIZE (16)
 175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 176
 177 static DEFINE_SPINLOCK(ptype_lock);
 178 static DEFINE_SPINLOCK(offload_lock);
 179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 180 static struct list_head ptype_all __read_mostly;        /* Taps */
 181 static struct list_head offload_base __read_mostly;
 182
 183 /*
 184  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 185  * semaphore.
 186  *
 187  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 188  *
 189  * Writers must hold the rtnl semaphore while they loop through the
 190  * dev_base_head list, and hold dev_base_lock for writing when they do the
 191  * actual updates.  This allows pure readers to access the list even
 192  * while a writer is preparing to update it.
 193  *
 194  * To put it another way, dev_base_lock is held for writing only to
 195  * protect against pure readers; the rtnl semaphore provides the
 196  * protection against other writers.
 197  *
 198  * See, for example usages, register_netdevice() and
 199  * unregister_netdevice(), which must be called with the rtnl
 200  * semaphore held.
 201  */
 202 DEFINE_RWLOCK(dev_base_lock);
 203 EXPORT_SYMBOL(dev_base_lock);
 204
 205 seqcount_t devnet_rename_seq;
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215
 216         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 217 }
 218
 219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 220 {
 221         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 222 }
 223
 224 static inline void rps_lock(struct softnet_data *sd)
 225 {
 226 #ifdef CONFIG_RPS
 227         spin_lock(&sd->input_pkt_queue.lock);
 228 #endif
 229 }
 230
 231 static inline void rps_unlock(struct softnet_data *sd)
 232 {
 233 #ifdef CONFIG_RPS
 234         spin_unlock(&sd->input_pkt_queue.lock);
 235 #endif
 236 }
 237
 238 /* Device list insertion */
 239 static int list_netdevice(struct net_device *dev)
 240 {
 241         struct net *net = dev_net(dev);
 242
 243         ASSERT_RTNL();
 244
 245         write_lock_bh(&dev_base_lock);
 246         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 247         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 248         hlist_add_head_rcu(&dev->index_hlist,
 249                            dev_index_hash(net, dev->ifindex));
 250         write_unlock_bh(&dev_base_lock);
 251
 252         dev_base_seq_inc(net);
 253
 254         return 0;
 255 }
 256
 257 /* Device list removal
 258  * caller must respect a RCU grace period before freeing/reusing dev
 259  */
 260 static void unlist_netdevice(struct net_device *dev)
 261 {
 262         ASSERT_RTNL();
 263
 264         /* Unlink dev from the device chain */
 265         write_lock_bh(&dev_base_lock);
 266         list_del_rcu(&dev->dev_list);
 267         hlist_del_rcu(&dev->name_hlist);
 268         hlist_del_rcu(&dev->index_hlist);
 269         write_unlock_bh(&dev_base_lock);
 270
 271         dev_base_seq_inc(dev_net(dev));
 272 }
 273
 274 /*
 275  *      Our notifier list
 276  */
 277
 278 static RAW_NOTIFIER_HEAD(netdev_chain);
 279
 280 /*
 281  *      Device drivers call our routines to queue packets here. We empty the
 282  *      queue in the local softnet handler.
 283  */
 284
 285 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 286 EXPORT_PER_CPU_SYMBOL(softnet_data);
 287
 288 #ifdef CONFIG_LOCKDEP
 289 /*
 290  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 291  * according to dev->type
 292  */
 293 static const unsigned short netdev_lock_type[] =
 294         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 295          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 296          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 297          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 298          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 299          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 300          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 301          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 302          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 303          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 304          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 305          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 306          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 307          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 308          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 324          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 325          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 326
 327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331 {
 332         int i;
 333
 334         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335                 if (netdev_lock_type[i] == dev_type)
 336                         return i;
 337         /* the last key is used by default */
 338         return ARRAY_SIZE(netdev_lock_type) - 1;
 339 }
 340
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344         int i;
 345
 346         i = netdev_lock_pos(dev_type);
 347         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348                                    netdev_lock_name[i]);
 349 }
 350
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353         int i;
 354
 355         i = netdev_lock_pos(dev->type);
 356         lockdep_set_class_and_name(&dev->addr_list_lock,
 357                                    &netdev_addr_lock_key[i],
 358                                    netdev_lock_name[i]);
 359 }
 360 #else
 361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362                                                  unsigned short dev_type)
 363 {
 364 }
 365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366 {
 367 }
 368 #endif
 369
 370 /*******************************************************************************
 371
 372                 Protocol management and registration routines
 373
 374 *******************************************************************************/
 375
 376 /*
 377  *      Add a protocol ID to the list. Now that the input handler is
 378  *      smarter we can dispense with all the messy stuff that used to be
 379  *      here.
 380  *
 381  *      BEWARE!!! Protocol handlers, mangling input packets,
 382  *      MUST BE last in hash buckets and checking protocol handlers
 383  *      MUST start from promiscuous ptype_all chain in net_bh.
 384  *      It is true now, do not change it.
 385  *      Explanation follows: if protocol handler, mangling packet, will
 386  *      be the first on list, it is not able to sense, that packet
 387  *      is cloned and should be copied-on-write, so that it will
 388  *      change it and subsequent readers will get broken packet.
 389  *                                                      --ANK (980803)
 390  */
 391
 392 static inline struct list_head *ptype_head(const struct packet_type *pt)
 393 {
 394         if (pt->type == htons(ETH_P_ALL))
 395                 return &ptype_all;
 396         else
 397                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398 }
 399
 400 /**
 401  *      dev_add_pack - add packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Add a protocol handler to the networking stack. The passed &packet_type
 405  *      is linked into kernel lists and may not be freed until it has been
 406  *      removed from the kernel lists.
 407  *
 408  *      This call does not sleep therefore it can not
 409  *      guarantee all CPU's that are in middle of receiving packets
 410  *      will see the new packet type (until the next received packet).
 411  */
 412
 413 void dev_add_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416
 417         spin_lock(&ptype_lock);
 418         list_add_rcu(&pt->list, head);
 419         spin_unlock(&ptype_lock);
 420 }
 421 EXPORT_SYMBOL(dev_add_pack);
 422
 423 /**
 424  *      __dev_remove_pack        - remove packet handler
 425  *      @pt: packet type declaration
 426  *
 427  *      Remove a protocol handler that was previously added to the kernel
 428  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429  *      from the kernel lists and can be freed or reused once this function
 430  *      returns.
 431  *
 432  *      The packet type might still be in use by receivers
 433  *      and must not be freed until after all the CPU's have gone
 434  *      through a quiescent state.
 435  */
 436 void __dev_remove_pack(struct packet_type *pt)
 437 {
 438         struct list_head *head = ptype_head(pt);
 439         struct packet_type *pt1;
 440
 441         spin_lock(&ptype_lock);
 442
 443         list_for_each_entry(pt1, head, list) {
 444                 if (pt == pt1) {
 445                         list_del_rcu(&pt->list);
 446                         goto out;
 447                 }
 448         }
 449
 450         pr_warn("dev_remove_pack: %p not found\n", pt);
 451 out:
 452         spin_unlock(&ptype_lock);
 453 }
 454 EXPORT_SYMBOL(__dev_remove_pack);
 455
 456 /**
 457  *      dev_remove_pack  - remove packet handler
 458  *      @pt: packet type declaration
 459  *
 460  *      Remove a protocol handler that was previously added to the kernel
 461  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462  *      from the kernel lists and can be freed or reused once this function
 463  *      returns.
 464  *
 465  *      This call sleeps to guarantee that no CPU is looking at the packet
 466  *      type after return.
 467  */
 468 void dev_remove_pack(struct packet_type *pt)
 469 {
 470         __dev_remove_pack(pt);
 471
 472         synchronize_net();
 473 }
 474 EXPORT_SYMBOL(dev_remove_pack);
 475
 476
 477 /**
 478  *      dev_add_offload - register offload handlers
 479  *      @po: protocol offload declaration
 480  *
 481  *      Add protocol offload handlers to the networking stack. The passed
 482  *      &proto_offload is linked into kernel lists and may not be freed until
 483  *      it has been removed from the kernel lists.
 484  *
 485  *      This call does not sleep therefore it can not
 486  *      guarantee all CPU's that are in middle of receiving packets
 487  *      will see the new offload handlers (until the next received packet).
 488  */
 489 void dev_add_offload(struct packet_offload *po)
 490 {
 491         struct list_head *head = &offload_base;
 492
 493         spin_lock(&offload_lock);
 494         list_add_rcu(&po->list, head);
 495         spin_unlock(&offload_lock);
 496 }
 497 EXPORT_SYMBOL(dev_add_offload);
 498
 499 /**
 500  *      __dev_remove_offload     - remove offload handler
 501  *      @po: packet offload declaration
 502  *
 503  *      Remove a protocol offload handler that was previously added to the
 504  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 505  *      is removed from the kernel lists and can be freed or reused once this
 506  *      function returns.
 507  *
 508  *      The packet type might still be in use by receivers
 509  *      and must not be freed until after all the CPU's have gone
 510  *      through a quiescent state.
 511  */
 512 void __dev_remove_offload(struct packet_offload *po)
 513 {
 514         struct list_head *head = &offload_base;
 515         struct packet_offload *po1;
 516
 517         spin_lock(&offload_lock);
 518
 519         list_for_each_entry(po1, head, list) {
 520                 if (po == po1) {
 521                         list_del_rcu(&po->list);
 522                         goto out;
 523                 }
 524         }
 525
 526         pr_warn("dev_remove_offload: %p not found\n", po);
 527 out:
 528         spin_unlock(&offload_lock);
 529 }
 530 EXPORT_SYMBOL(__dev_remove_offload);
 531
 532 /**
 533  *      dev_remove_offload       - remove packet offload handler
 534  *      @po: packet offload declaration
 535  *
 536  *      Remove a packet offload handler that was previously added to the kernel
 537  *      offload handlers by dev_add_offload(). The passed &offload_type is
 538  *      removed from the kernel lists and can be freed or reused once this
 539  *      function returns.
 540  *
 541  *      This call sleeps to guarantee that no CPU is looking at the packet
 542  *      type after return.
 543  */
 544 void dev_remove_offload(struct packet_offload *po)
 545 {
 546         __dev_remove_offload(po);
 547
 548         synchronize_net();
 549 }
 550 EXPORT_SYMBOL(dev_remove_offload);
 551
 552 /******************************************************************************
 553
 554                       Device Boot-time Settings Routines
 555
 556 *******************************************************************************/
 557
 558 /* Boot time configuration table */
 559 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 560
 561 /**
 562  *      netdev_boot_setup_add   - add new setup entry
 563  *      @name: name of the device
 564  *      @map: configured settings for the device
 565  *
 566  *      Adds new setup entry to the dev_boot_setup list.  The function
 567  *      returns 0 on error and 1 on success.  This is a generic routine to
 568  *      all netdevices.
 569  */
 570 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 571 {
 572         struct netdev_boot_setup *s;
 573         int i;
 574
 575         s = dev_boot_setup;
 576         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 577                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 578                         memset(s[i].name, 0, sizeof(s[i].name));
 579                         strlcpy(s[i].name, name, IFNAMSIZ);
 580                         memcpy(&s[i].map, map, sizeof(s[i].map));
 581                         break;
 582                 }
 583         }
 584
 585         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 586 }
 587
 588 /**
 589  *      netdev_boot_setup_check - check boot time settings
 590  *      @dev: the netdevice
 591  *
 592  *      Check boot time settings for the device.
 593  *      The found settings are set for the device to be used
 594  *      later in the device probing.
 595  *      Returns 0 if no settings found, 1 if they are.
 596  */
 597 int netdev_boot_setup_check(struct net_device *dev)
 598 {
 599         struct netdev_boot_setup *s = dev_boot_setup;
 600         int i;
 601
 602         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 603                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 604                     !strcmp(dev->name, s[i].name)) {
 605                         dev->irq        = s[i].map.irq;
 606                         dev->base_addr  = s[i].map.base_addr;
 607                         dev->mem_start  = s[i].map.mem_start;
 608                         dev->mem_end    = s[i].map.mem_end;
 609                         return 1;
 610                 }
 611         }
 612         return 0;
 613 }
 614 EXPORT_SYMBOL(netdev_boot_setup_check);
 615
 616
 617 /**
 618  *      netdev_boot_base        - get address from boot time settings
 619  *      @prefix: prefix for network device
 620  *      @unit: id for network device
 621  *
 622  *      Check boot time settings for the base address of device.
 623  *      The found settings are set for the device to be used
 624  *      later in the device probing.
 625  *      Returns 0 if no settings found.
 626  */
 627 unsigned long netdev_boot_base(const char *prefix, int unit)
 628 {
 629         const struct netdev_boot_setup *s = dev_boot_setup;
 630         char name[IFNAMSIZ];
 631         int i;
 632
 633         sprintf(name, "%s%d", prefix, unit);
 634
 635         /*
 636          * If device already registered then return base of 1
 637          * to indicate not to probe for this interface
 638          */
 639         if (__dev_get_by_name(&init_net, name))
 640                 return 1;
 641
 642         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 643                 if (!strcmp(name, s[i].name))
 644                         return s[i].map.base_addr;
 645         return 0;
 646 }
 647
 648 /*
 649  * Saves at boot time configured settings for any netdevice.
 650  */
 651 int __init netdev_boot_setup(char *str)
 652 {
 653         int ints[5];
 654         struct ifmap map;
 655
 656         str = get_options(str, ARRAY_SIZE(ints), ints);
 657         if (!str || !*str)
 658                 return 0;
 659
 660         /* Save settings */
 661         memset(&map, 0, sizeof(map));
 662         if (ints[0] > 0)
 663                 map.irq = ints[1];
 664         if (ints[0] > 1)
 665                 map.base_addr = ints[2];
 666         if (ints[0] > 2)
 667                 map.mem_start = ints[3];
 668         if (ints[0] > 3)
 669                 map.mem_end = ints[4];
 670
 671         /* Add new entry to the list */
 672         return netdev_boot_setup_add(str, &map);
 673 }
 674
 675 __setup("netdev=", netdev_boot_setup);
 676
 677 /*******************************************************************************
 678
 679                             Device Interface Subroutines
 680
 681 *******************************************************************************/
 682
 683 /**
 684  *      __dev_get_by_name       - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name. Must be called under RTNL semaphore
 689  *      or @dev_base_lock. If the name is found a pointer to the device
 690  *      is returned. If the name is not found then %NULL is returned. The
 691  *      reference counters are not incremented so the caller must be
 692  *      careful with locks.
 693  */
 694
 695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 696 {
 697         struct hlist_node *p;
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry(dev, p, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_name);
 708
 709 /**
 710  *      dev_get_by_name_rcu     - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name.
 715  *      If the name is found a pointer to the device is returned.
 716  *      If the name is not found then %NULL is returned.
 717  *      The reference counters are not incremented so the caller must be
 718  *      careful with locks. The caller must hold RCU lock.
 719  */
 720
 721 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 722 {
 723         struct hlist_node *p;
 724         struct net_device *dev;
 725         struct hlist_head *head = dev_name_hash(net, name);
 726
 727         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 728                 if (!strncmp(dev->name, name, IFNAMSIZ))
 729                         return dev;
 730
 731         return NULL;
 732 }
 733 EXPORT_SYMBOL(dev_get_by_name_rcu);
 734
 735 /**
 736  *      dev_get_by_name         - find a device by its name
 737  *      @net: the applicable net namespace
 738  *      @name: name to find
 739  *
 740  *      Find an interface by name. This can be called from any
 741  *      context and does its own locking. The returned handle has
 742  *      the usage count incremented and the caller must use dev_put() to
 743  *      release it when it is no longer needed. %NULL is returned if no
 744  *      matching device is found.
 745  */
 746
 747 struct net_device *dev_get_by_name(struct net *net, const char *name)
 748 {
 749         struct net_device *dev;
 750
 751         rcu_read_lock();
 752         dev = dev_get_by_name_rcu(net, name);
 753         if (dev)
 754                 dev_hold(dev);
 755         rcu_read_unlock();
 756         return dev;
 757 }
 758 EXPORT_SYMBOL(dev_get_by_name);
 759
 760 /**
 761  *      __dev_get_by_index - find a device by its ifindex
 762  *      @net: the applicable net namespace
 763  *      @ifindex: index of device
 764  *
 765  *      Search for an interface by index. Returns %NULL if the device
 766  *      is not found or a pointer to the device. The device has not
 767  *      had its reference counter increased so the caller must be careful
 768  *      about locking. The caller must hold either the RTNL semaphore
 769  *      or @dev_base_lock.
 770  */
 771
 772 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 773 {
 774         struct hlist_node *p;
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry(dev, p, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(__dev_get_by_index);
 785
 786 /**
 787  *      dev_get_by_index_rcu - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns %NULL if the device
 792  *      is not found or a pointer to the device. The device has not
 793  *      had its reference counter increased so the caller must be careful
 794  *      about locking. The caller must hold RCU lock.
 795  */
 796
 797 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 798 {
 799         struct hlist_node *p;
 800         struct net_device *dev;
 801         struct hlist_head *head = dev_index_hash(net, ifindex);
 802
 803         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 804                 if (dev->ifindex == ifindex)
 805                         return dev;
 806
 807         return NULL;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index_rcu);
 810
 811
 812 /**
 813  *      dev_get_by_index - find a device by its ifindex
 814  *      @net: the applicable net namespace
 815  *      @ifindex: index of device
 816  *
 817  *      Search for an interface by index. Returns NULL if the device
 818  *      is not found or a pointer to the device. The device returned has
 819  *      had a reference added and the pointer is safe until the user calls
 820  *      dev_put to indicate they have finished with it.
 821  */
 822
 823 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 824 {
 825         struct net_device *dev;
 826
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (dev)
 830                 dev_hold(dev);
 831         rcu_read_unlock();
 832         return dev;
 833 }
 834 EXPORT_SYMBOL(dev_get_by_index);
 835
 836 /**
 837  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 838  *      @net: the applicable net namespace
 839  *      @type: media type of device
 840  *      @ha: hardware address
 841  *
 842  *      Search for an interface by MAC address. Returns NULL if the device
 843  *      is not found or a pointer to the device.
 844  *      The caller must hold RCU or RTNL.
 845  *      The returned device has not had its ref count increased
 846  *      and the caller must therefore be careful about locking
 847  *
 848  */
 849
 850 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 851                                        const char *ha)
 852 {
 853         struct net_device *dev;
 854
 855         for_each_netdev_rcu(net, dev)
 856                 if (dev->type == type &&
 857                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 858                         return dev;
 859
 860         return NULL;
 861 }
 862 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 863
 864 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 865 {
 866         struct net_device *dev;
 867
 868         ASSERT_RTNL();
 869         for_each_netdev(net, dev)
 870                 if (dev->type == type)
 871                         return dev;
 872
 873         return NULL;
 874 }
 875 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 876
 877 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 878 {
 879         struct net_device *dev, *ret = NULL;
 880
 881         rcu_read_lock();
 882         for_each_netdev_rcu(net, dev)
 883                 if (dev->type == type) {
 884                         dev_hold(dev);
 885                         ret = dev;
 886                         break;
 887                 }
 888         rcu_read_unlock();
 889         return ret;
 890 }
 891 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 892
 893 /**
 894  *      dev_get_by_flags_rcu - find any device with given flags
 895  *      @net: the applicable net namespace
 896  *      @if_flags: IFF_* values
 897  *      @mask: bitmask of bits in if_flags to check
 898  *
 899  *      Search for any interface with the given flags. Returns NULL if a device
 900  *      is not found or a pointer to the device. Must be called inside
 901  *      rcu_read_lock(), and result refcount is unchanged.
 902  */
 903
 904 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 905                                     unsigned short mask)
 906 {
 907         struct net_device *dev, *ret;
 908
 909         ret = NULL;
 910         for_each_netdev_rcu(net, dev) {
 911                 if (((dev->flags ^ if_flags) & mask) == 0) {
 912                         ret = dev;
 913                         break;
 914                 }
 915         }
 916         return ret;
 917 }
 918 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 919
 920 /**
 921  *      dev_valid_name - check if name is okay for network device
 922  *      @name: name string
 923  *
 924  *      Network device names need to be valid file names to
 925  *      to allow sysfs to work.  We also disallow any kind of
 926  *      whitespace.
 927  */
 928 bool dev_valid_name(const char *name)
 929 {
 930         if (*name == '\0')
 931                 return false;
 932         if (strlen(name) >= IFNAMSIZ)
 933                 return false;
 934         if (!strcmp(name, ".") || !strcmp(name, ".."))
 935                 return false;
 936
 937         while (*name) {
 938                 if (*name == '/' || isspace(*name))
 939                         return false;
 940                 name++;
 941         }
 942         return true;
 943 }
 944 EXPORT_SYMBOL(dev_valid_name);
 945
 946 /**
 947  *      __dev_alloc_name - allocate a name for a device
 948  *      @net: network namespace to allocate the device name in
 949  *      @name: name format string
 950  *      @buf:  scratch buffer and result name string
 951  *
 952  *      Passed a format string - eg "lt%d" it will try and find a suitable
 953  *      id. It scans list of devices to build up a free map, then chooses
 954  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 955  *      while allocating the name and adding the device in order to avoid
 956  *      duplicates.
 957  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 958  *      Returns the number of the unit assigned or a negative errno code.
 959  */
 960
 961 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 962 {
 963         int i = 0;
 964         const char *p;
 965         const int max_netdevices = 8*PAGE_SIZE;
 966         unsigned long *inuse;
 967         struct net_device *d;
 968
 969         p = strnchr(name, IFNAMSIZ-1, '%');
 970         if (p) {
 971                 /*
 972                  * Verify the string as this thing may have come from
 973                  * the user.  There must be either one "%d" and no other "%"
 974                  * characters.
 975                  */
 976                 if (p[1] != 'd' || strchr(p + 2, '%'))
 977                         return -EINVAL;
 978
 979                 /* Use one page as a bit array of possible slots */
 980                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 981                 if (!inuse)
 982                         return -ENOMEM;
 983
 984                 for_each_netdev(net, d) {
 985                         if (!sscanf(d->name, name, &i))
 986                                 continue;
 987                         if (i < 0 || i >= max_netdevices)
 988                                 continue;
 989
 990                         /*  avoid cases where sscanf is not exact inverse of printf */
 991                         snprintf(buf, IFNAMSIZ, name, i);
 992                         if (!strncmp(buf, d->name, IFNAMSIZ))
 993                                 set_bit(i, inuse);
 994                 }
 995
 996                 i = find_first_zero_bit(inuse, max_netdevices);
 997                 free_page((unsigned long) inuse);
 998         }
 999
1000         if (buf != name)
1001                 snprintf(buf, IFNAMSIZ, name, i);
1002         if (!__dev_get_by_name(net, buf))
1003                 return i;
1004
1005         /* It is possible to run out of possible slots
1006          * when the name is long and there isn't enough space left
1007          * for the digits, or if all bits are used.
1008          */
1009         return -ENFILE;
1010 }
1011
1012 /**
1013  *      dev_alloc_name - allocate a name for a device
1014  *      @dev: device
1015  *      @name: name format string
1016  *
1017  *      Passed a format string - eg "lt%d" it will try and find a suitable
1018  *      id. It scans list of devices to build up a free map, then chooses
1019  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1020  *      while allocating the name and adding the device in order to avoid
1021  *      duplicates.
1022  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1023  *      Returns the number of the unit assigned or a negative errno code.
1024  */
1025
1026 int dev_alloc_name(struct net_device *dev, const char *name)
1027 {
1028         char buf[IFNAMSIZ];
1029         struct net *net;
1030         int ret;
1031
1032         BUG_ON(!dev_net(dev));
1033         net = dev_net(dev);
1034         ret = __dev_alloc_name(net, name, buf);
1035         if (ret >= 0)
1036                 strlcpy(dev->name, buf, IFNAMSIZ);
1037         return ret;
1038 }
1039 EXPORT_SYMBOL(dev_alloc_name);
1040
1041 static int dev_alloc_name_ns(struct net *net,
1042                              struct net_device *dev,
1043                              const char *name)
1044 {
1045         char buf[IFNAMSIZ];
1046         int ret;
1047
1048         ret = __dev_alloc_name(net, name, buf);
1049         if (ret >= 0)
1050                 strlcpy(dev->name, buf, IFNAMSIZ);
1051         return ret;
1052 }
1053
1054 static int dev_get_valid_name(struct net *net,
1055                               struct net_device *dev,
1056                               const char *name)
1057 {
1058         BUG_ON(!net);
1059
1060         if (!dev_valid_name(name))
1061                 return -EINVAL;
1062
1063         if (strchr(name, '%'))
1064                 return dev_alloc_name_ns(net, dev, name);
1065         else if (__dev_get_by_name(net, name))
1066                 return -EEXIST;
1067         else if (dev->name != name)
1068                 strlcpy(dev->name, name, IFNAMSIZ);
1069
1070         return 0;
1071 }
1072
1073 /**
1074  *      dev_change_name - change name of a device
1075  *      @dev: device
1076  *      @newname: name (or format string) must be at least IFNAMSIZ
1077  *
1078  *      Change name of a device, can pass format strings "eth%d".
1079  *      for wildcarding.
1080  */
1081 int dev_change_name(struct net_device *dev, const char *newname)
1082 {
1083         char oldname[IFNAMSIZ];
1084         int err = 0;
1085         int ret;
1086         struct net *net;
1087
1088         ASSERT_RTNL();
1089         BUG_ON(!dev_net(dev));
1090
1091         net = dev_net(dev);
1092         if (dev->flags & IFF_UP)
1093                 return -EBUSY;
1094
1095         write_seqcount_begin(&devnet_rename_seq);
1096
1097         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1098                 write_seqcount_end(&devnet_rename_seq);
1099                 return 0;
1100         }
1101
1102         memcpy(oldname, dev->name, IFNAMSIZ);
1103
1104         err = dev_get_valid_name(net, dev, newname);
1105         if (err < 0) {
1106                 write_seqcount_end(&devnet_rename_seq);
1107                 return err;
1108         }
1109
1110 rollback:
1111         ret = device_rename(&dev->dev, dev->name);
1112         if (ret) {
1113                 memcpy(dev->name, oldname, IFNAMSIZ);
1114                 write_seqcount_end(&devnet_rename_seq);
1115                 return ret;
1116         }
1117
1118         write_seqcount_end(&devnet_rename_seq);
1119
1120         write_lock_bh(&dev_base_lock);
1121         hlist_del_rcu(&dev->name_hlist);
1122         write_unlock_bh(&dev_base_lock);
1123
1124         synchronize_rcu();
1125
1126         write_lock_bh(&dev_base_lock);
1127         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1128         write_unlock_bh(&dev_base_lock);
1129
1130         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1131         ret = notifier_to_errno(ret);
1132
1133         if (ret) {
1134                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135                 if (err >= 0) {
1136                         err = ret;
1137                         write_seqcount_begin(&devnet_rename_seq);
1138                         memcpy(dev->name, oldname, IFNAMSIZ);
1139                         goto rollback;
1140                 } else {
1141                         pr_err("%s: name change rollback failed: %d\n",
1142                                dev->name, ret);
1143                 }
1144         }
1145
1146         return err;
1147 }
1148
1149 /**
1150  *      dev_set_alias - change ifalias of a device
1151  *      @dev: device
1152  *      @alias: name up to IFALIASZ
1153  *      @len: limit of bytes to copy from info
1154  *
1155  *      Set ifalias for a device,
1156  */
1157 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1158 {
1159         char *new_ifalias;
1160
1161         ASSERT_RTNL();
1162
1163         if (len >= IFALIASZ)
1164                 return -EINVAL;
1165
1166         if (!len) {
1167                 kfree(dev->ifalias);
1168                 dev->ifalias = NULL;
1169                 return 0;
1170         }
1171
1172         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1173         if (!new_ifalias)
1174                 return -ENOMEM;
1175         dev->ifalias = new_ifalias;
1176
1177         strlcpy(dev->ifalias, alias, len+1);
1178         return len;
1179 }
1180
1181
1182 /**
1183  *      netdev_features_change - device changes features
1184  *      @dev: device to cause notification
1185  *
1186  *      Called to indicate a device has changed features.
1187  */
1188 void netdev_features_change(struct net_device *dev)
1189 {
1190         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1191 }
1192 EXPORT_SYMBOL(netdev_features_change);
1193
1194 /**
1195  *      netdev_state_change - device changes state
1196  *      @dev: device to cause notification
1197  *
1198  *      Called to indicate a device has changed state. This function calls
1199  *      the notifier chains for netdev_chain and sends a NEWLINK message
1200  *      to the routing socket.
1201  */
1202 void netdev_state_change(struct net_device *dev)
1203 {
1204         if (dev->flags & IFF_UP) {
1205                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1206                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1207         }
1208 }
1209 EXPORT_SYMBOL(netdev_state_change);
1210
1211 /**
1212  *      netdev_notify_peers - notify network peers about existence of @dev
1213  *      @dev: network device
1214  *
1215  * Generate traffic such that interested network peers are aware of
1216  * @dev, such as by generating a gratuitous ARP. This may be used when
1217  * a device wants to inform the rest of the network about some sort of
1218  * reconfiguration such as a failover event or virtual machine
1219  * migration.
1220  */
1221 void netdev_notify_peers(struct net_device *dev)
1222 {
1223         rtnl_lock();
1224         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1225         rtnl_unlock();
1226 }
1227 EXPORT_SYMBOL(netdev_notify_peers);
1228
1229 /**
1230  *      dev_load        - load a network module
1231  *      @net: the applicable net namespace
1232  *      @name: name of interface
1233  *
1234  *      If a network interface is not present and the process has suitable
1235  *      privileges this function loads the module. If module loading is not
1236  *      available in this kernel then it becomes a nop.
1237  */
1238
1239 void dev_load(struct net *net, const char *name)
1240 {
1241         struct net_device *dev;
1242         int no_module;
1243
1244         rcu_read_lock();
1245         dev = dev_get_by_name_rcu(net, name);
1246         rcu_read_unlock();
1247
1248         no_module = !dev;
1249         if (no_module && capable(CAP_NET_ADMIN))
1250                 no_module = request_module("netdev-%s", name);
1251         if (no_module && capable(CAP_SYS_MODULE)) {
1252                 if (!request_module("%s", name))
1253                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1254                                 name);
1255         }
1256 }
1257 EXPORT_SYMBOL(dev_load);
1258
1259 static int __dev_open(struct net_device *dev)
1260 {
1261         const struct net_device_ops *ops = dev->netdev_ops;
1262         int ret;
1263
1264         ASSERT_RTNL();
1265
1266         if (!netif_device_present(dev))
1267                 return -ENODEV;
1268
1269         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1270         ret = notifier_to_errno(ret);
1271         if (ret)
1272                 return ret;
1273
1274         set_bit(__LINK_STATE_START, &dev->state);
1275
1276         if (ops->ndo_validate_addr)
1277                 ret = ops->ndo_validate_addr(dev);
1278
1279         if (!ret && ops->ndo_open)
1280                 ret = ops->ndo_open(dev);
1281
1282         if (ret)
1283                 clear_bit(__LINK_STATE_START, &dev->state);
1284         else {
1285                 dev->flags |= IFF_UP;
1286                 net_dmaengine_get();
1287                 dev_set_rx_mode(dev);
1288                 dev_activate(dev);
1289                 add_device_randomness(dev->dev_addr, dev->addr_len);
1290         }
1291
1292         return ret;
1293 }
1294
1295 /**
1296  *      dev_open        - prepare an interface for use.
1297  *      @dev:   device to open
1298  *
1299  *      Takes a device from down to up state. The device's private open
1300  *      function is invoked and then the multicast lists are loaded. Finally
1301  *      the device is moved into the up state and a %NETDEV_UP message is
1302  *      sent to the netdev notifier chain.
1303  *
1304  *      Calling this function on an active interface is a nop. On a failure
1305  *      a negative errno code is returned.
1306  */
1307 int dev_open(struct net_device *dev)
1308 {
1309         int ret;
1310
1311         if (dev->flags & IFF_UP)
1312                 return 0;
1313
1314         ret = __dev_open(dev);
1315         if (ret < 0)
1316                 return ret;
1317
1318         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1319         call_netdevice_notifiers(NETDEV_UP, dev);
1320
1321         return ret;
1322 }
1323 EXPORT_SYMBOL(dev_open);
1324
1325 static int __dev_close_many(struct list_head *head)
1326 {
1327         struct net_device *dev;
1328
1329         ASSERT_RTNL();
1330         might_sleep();
1331
1332         list_for_each_entry(dev, head, unreg_list) {
1333                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1334
1335                 clear_bit(__LINK_STATE_START, &dev->state);
1336
1337                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1338                  * can be even on different cpu. So just clear netif_running().
1339                  *
1340                  * dev->stop() will invoke napi_disable() on all of it's
1341                  * napi_struct instances on this device.
1342                  */
1343                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1344         }
1345
1346         dev_deactivate_many(head);
1347
1348         list_for_each_entry(dev, head, unreg_list) {
1349                 const struct net_device_ops *ops = dev->netdev_ops;
1350
1351                 /*
1352                  *      Call the device specific close. This cannot fail.
1353                  *      Only if device is UP
1354                  *
1355                  *      We allow it to be called even after a DETACH hot-plug
1356                  *      event.
1357                  */
1358                 if (ops->ndo_stop)
1359                         ops->ndo_stop(dev);
1360
1361                 dev->flags &= ~IFF_UP;
1362                 net_dmaengine_put();
1363         }
1364
1365         return 0;
1366 }
1367
1368 static int __dev_close(struct net_device *dev)
1369 {
1370         int retval;
1371         LIST_HEAD(single);
1372
1373         list_add(&dev->unreg_list, &single);
1374         retval = __dev_close_many(&single);
1375         list_del(&single);
1376         return retval;
1377 }
1378
1379 static int dev_close_many(struct list_head *head)
1380 {
1381         struct net_device *dev, *tmp;
1382         LIST_HEAD(tmp_list);
1383
1384         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1385                 if (!(dev->flags & IFF_UP))
1386                         list_move(&dev->unreg_list, &tmp_list);
1387
1388         __dev_close_many(head);
1389
1390         list_for_each_entry(dev, head, unreg_list) {
1391                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1392                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1393         }
1394
1395         /* rollback_registered_many needs the complete original list */
1396         list_splice(&tmp_list, head);
1397         return 0;
1398 }
1399
1400 /**
1401  *      dev_close - shutdown an interface.
1402  *      @dev: device to shutdown
1403  *
1404  *      This function moves an active device into down state. A
1405  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1406  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1407  *      chain.
1408  */
1409 int dev_close(struct net_device *dev)
1410 {
1411         if (dev->flags & IFF_UP) {
1412                 LIST_HEAD(single);
1413
1414                 list_add(&dev->unreg_list, &single);
1415                 dev_close_many(&single);
1416                 list_del(&single);
1417         }
1418         return 0;
1419 }
1420 EXPORT_SYMBOL(dev_close);
1421
1422
1423 /**
1424  *      dev_disable_lro - disable Large Receive Offload on a device
1425  *      @dev: device
1426  *
1427  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1428  *      called under RTNL.  This is needed if received packets may be
1429  *      forwarded to another interface.
1430  */
1431 void dev_disable_lro(struct net_device *dev)
1432 {
1433         /*
1434          * If we're trying to disable lro on a vlan device
1435          * use the underlying physical device instead
1436          */
1437         if (is_vlan_dev(dev))
1438                 dev = vlan_dev_real_dev(dev);
1439
1440         dev->wanted_features &= ~NETIF_F_LRO;
1441         netdev_update_features(dev);
1442
1443         if (unlikely(dev->features & NETIF_F_LRO))
1444                 netdev_WARN(dev, "failed to disable LRO!\n");
1445 }
1446 EXPORT_SYMBOL(dev_disable_lro);
1447
1448
1449 static int dev_boot_phase = 1;
1450
1451 /**
1452  *      register_netdevice_notifier - register a network notifier block
1453  *      @nb: notifier
1454  *
1455  *      Register a notifier to be called when network device events occur.
1456  *      The notifier passed is linked into the kernel structures and must
1457  *      not be reused until it has been unregistered. A negative errno code
1458  *      is returned on a failure.
1459  *
1460  *      When registered all registration and up events are replayed
1461  *      to the new notifier to allow device to have a race free
1462  *      view of the network device list.
1463  */
1464
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467         struct net_device *dev;
1468         struct net_device *last;
1469         struct net *net;
1470         int err;
1471
1472         rtnl_lock();
1473         err = raw_notifier_chain_register(&netdev_chain, nb);
1474         if (err)
1475                 goto unlock;
1476         if (dev_boot_phase)
1477                 goto unlock;
1478         for_each_net(net) {
1479                 for_each_netdev(net, dev) {
1480                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1481                         err = notifier_to_errno(err);
1482                         if (err)
1483                                 goto rollback;
1484
1485                         if (!(dev->flags & IFF_UP))
1486                                 continue;
1487
1488                         nb->notifier_call(nb, NETDEV_UP, dev);
1489                 }
1490         }
1491
1492 unlock:
1493         rtnl_unlock();
1494         return err;
1495
1496 rollback:
1497         last = dev;
1498         for_each_net(net) {
1499                 for_each_netdev(net, dev) {
1500                         if (dev == last)
1501                                 goto outroll;
1502
1503                         if (dev->flags & IFF_UP) {
1504                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1505                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1506                         }
1507                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1508                 }
1509         }
1510
1511 outroll:
1512         raw_notifier_chain_unregister(&netdev_chain, nb);
1513         goto unlock;
1514 }
1515 EXPORT_SYMBOL(register_netdevice_notifier);
1516
1517 /**
1518  *      unregister_netdevice_notifier - unregister a network notifier block
1519  *      @nb: notifier
1520  *
1521  *      Unregister a notifier previously registered by
1522  *      register_netdevice_notifier(). The notifier is unlinked into the
1523  *      kernel structures and may then be reused. A negative errno code
1524  *      is returned on a failure.
1525  *
1526  *      After unregistering unregister and down device events are synthesized
1527  *      for all devices on the device list to the removed notifier to remove
1528  *      the need for special case cleanup code.
1529  */
1530
1531 int unregister_netdevice_notifier(struct notifier_block *nb)
1532 {
1533         struct net_device *dev;
1534         struct net *net;
1535         int err;
1536
1537         rtnl_lock();
1538         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1539         if (err)
1540                 goto unlock;
1541
1542         for_each_net(net) {
1543                 for_each_netdev(net, dev) {
1544                         if (dev->flags & IFF_UP) {
1545                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1546                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1547                         }
1548                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1549                 }
1550         }
1551 unlock:
1552         rtnl_unlock();
1553         return err;
1554 }
1555 EXPORT_SYMBOL(unregister_netdevice_notifier);
1556
1557 /**
1558  *      call_netdevice_notifiers - call all network notifier blocks
1559  *      @val: value passed unmodified to notifier function
1560  *      @dev: net_device pointer passed unmodified to notifier function
1561  *
1562  *      Call all network notifier blocks.  Parameters and return value
1563  *      are as for raw_notifier_call_chain().
1564  */
1565
1566 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1567 {
1568         ASSERT_RTNL();
1569         return raw_notifier_call_chain(&netdev_chain, val, dev);
1570 }
1571 EXPORT_SYMBOL(call_netdevice_notifiers);
1572
1573 static struct static_key netstamp_needed __read_mostly;
1574 #ifdef HAVE_JUMP_LABEL
1575 /* We are not allowed to call static_key_slow_dec() from irq context
1576  * If net_disable_timestamp() is called from irq context, defer the
1577  * static_key_slow_dec() calls.
1578  */
1579 static atomic_t netstamp_needed_deferred;
1580 #endif
1581
1582 void net_enable_timestamp(void)
1583 {
1584 #ifdef HAVE_JUMP_LABEL
1585         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1586
1587         if (deferred) {
1588                 while (--deferred)
1589                         static_key_slow_dec(&netstamp_needed);
1590                 return;
1591         }
1592 #endif
1593         WARN_ON(in_interrupt());
1594         static_key_slow_inc(&netstamp_needed);
1595 }
1596 EXPORT_SYMBOL(net_enable_timestamp);
1597
1598 void net_disable_timestamp(void)
1599 {
1600 #ifdef HAVE_JUMP_LABEL
1601         if (in_interrupt()) {
1602                 atomic_inc(&netstamp_needed_deferred);
1603                 return;
1604         }
1605 #endif
1606         static_key_slow_dec(&netstamp_needed);
1607 }
1608 EXPORT_SYMBOL(net_disable_timestamp);
1609
1610 static inline void net_timestamp_set(struct sk_buff *skb)
1611 {
1612         skb->tstamp.tv64 = 0;
1613         if (static_key_false(&netstamp_needed))
1614                 __net_timestamp(skb);
1615 }
1616
1617 #define net_timestamp_check(COND, SKB)                  \
1618         if (static_key_false(&netstamp_needed)) {               \
1619                 if ((COND) && !(SKB)->tstamp.tv64)      \
1620                         __net_timestamp(SKB);           \
1621         }                                               \
1622
1623 static int net_hwtstamp_validate(struct ifreq *ifr)
1624 {
1625         struct hwtstamp_config cfg;
1626         enum hwtstamp_tx_types tx_type;
1627         enum hwtstamp_rx_filters rx_filter;
1628         int tx_type_valid = 0;
1629         int rx_filter_valid = 0;
1630
1631         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1632                 return -EFAULT;
1633
1634         if (cfg.flags) /* reserved for future extensions */
1635                 return -EINVAL;
1636
1637         tx_type = cfg.tx_type;
1638         rx_filter = cfg.rx_filter;
1639
1640         switch (tx_type) {
1641         case HWTSTAMP_TX_OFF:
1642         case HWTSTAMP_TX_ON:
1643         case HWTSTAMP_TX_ONESTEP_SYNC:
1644                 tx_type_valid = 1;
1645                 break;
1646         }
1647
1648         switch (rx_filter) {
1649         case HWTSTAMP_FILTER_NONE:
1650         case HWTSTAMP_FILTER_ALL:
1651         case HWTSTAMP_FILTER_SOME:
1652         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1653         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1654         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1655         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1656         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1657         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1658         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1659         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1660         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1661         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1662         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1663         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1664                 rx_filter_valid = 1;
1665                 break;
1666         }
1667
1668         if (!tx_type_valid || !rx_filter_valid)
1669                 return -ERANGE;
1670
1671         return 0;
1672 }
1673
1674 static inline bool is_skb_forwardable(struct net_device *dev,
1675                                       struct sk_buff *skb)
1676 {
1677         unsigned int len;
1678
1679         if (!(dev->flags & IFF_UP))
1680                 return false;
1681
1682         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1683         if (skb->len <= len)
1684                 return true;
1685
1686         /* if TSO is enabled, we don't care about the length as the packet
1687          * could be forwarded without being segmented before
1688          */
1689         if (skb_is_gso(skb))
1690                 return true;
1691
1692         return false;
1693 }
1694
1695 /**
1696  * dev_forward_skb - loopback an skb to another netif
1697  *
1698  * @dev: destination network device
1699  * @skb: buffer to forward
1700  *
1701  * return values:
1702  *      NET_RX_SUCCESS  (no congestion)
1703  *      NET_RX_DROP     (packet was dropped, but freed)
1704  *
1705  * dev_forward_skb can be used for injecting an skb from the
1706  * start_xmit function of one device into the receive queue
1707  * of another device.
1708  *
1709  * The receiving device may be in another namespace, so
1710  * we have to clear all information in the skb that could
1711  * impact namespace isolation.
1712  */
1713 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1714 {
1715         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1716                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1717                         atomic_long_inc(&dev->rx_dropped);
1718                         kfree_skb(skb);
1719                         return NET_RX_DROP;
1720                 }
1721         }
1722
1723         skb_orphan(skb);
1724         nf_reset(skb);
1725
1726         if (unlikely(!is_skb_forwardable(dev, skb))) {
1727                 atomic_long_inc(&dev->rx_dropped);
1728                 kfree_skb(skb);
1729                 return NET_RX_DROP;
1730         }
1731         skb->skb_iif = 0;
1732         skb->dev = dev;
1733         skb_dst_drop(skb);
1734         skb->tstamp.tv64 = 0;
1735         skb->pkt_type = PACKET_HOST;
1736         skb->protocol = eth_type_trans(skb, dev);
1737         skb->mark = 0;
1738         secpath_reset(skb);
1739         nf_reset(skb);
1740         return netif_rx(skb);
1741 }
1742 EXPORT_SYMBOL_GPL(dev_forward_skb);
1743
1744 static inline int deliver_skb(struct sk_buff *skb,
1745                               struct packet_type *pt_prev,
1746                               struct net_device *orig_dev)
1747 {
1748         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1749                 return -ENOMEM;
1750         atomic_inc(&skb->users);
1751         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1752 }
1753
1754 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1755 {
1756         if (!ptype->af_packet_priv || !skb->sk)
1757                 return false;
1758
1759         if (ptype->id_match)
1760                 return ptype->id_match(ptype, skb->sk);
1761         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1762                 return true;
1763
1764         return false;
1765 }
1766
1767 /*
1768  *      Support routine. Sends outgoing frames to any network
1769  *      taps currently in use.
1770  */
1771
1772 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1773 {
1774         struct packet_type *ptype;
1775         struct sk_buff *skb2 = NULL;
1776         struct packet_type *pt_prev = NULL;
1777
1778         rcu_read_lock();
1779         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1780                 /* Never send packets back to the socket
1781                  * they originated from - MvS (miquels@drinkel.ow.org)
1782                  */
1783                 if ((ptype->dev == dev || !ptype->dev) &&
1784                     (!skb_loop_sk(ptype, skb))) {
1785                         if (pt_prev) {
1786                                 deliver_skb(skb2, pt_prev, skb->dev);
1787                                 pt_prev = ptype;
1788                                 continue;
1789                         }
1790
1791                         skb2 = skb_clone(skb, GFP_ATOMIC);
1792                         if (!skb2)
1793                                 break;
1794
1795                         net_timestamp_set(skb2);
1796
1797                         /* skb->nh should be correctly
1798                            set by sender, so that the second statement is
1799                            just protection against buggy protocols.
1800                          */
1801                         skb_reset_mac_header(skb2);
1802
1803                         if (skb_network_header(skb2) < skb2->data ||
1804                             skb2->network_header > skb2->tail) {
1805                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1806                                                      ntohs(skb2->protocol),
1807                                                      dev->name);
1808                                 skb_reset_network_header(skb2);
1809                         }
1810
1811                         skb2->transport_header = skb2->network_header;
1812                         skb2->pkt_type = PACKET_OUTGOING;
1813                         pt_prev = ptype;
1814                 }
1815         }
1816         if (pt_prev)
1817                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1818         rcu_read_unlock();
1819 }
1820
1821 /**
1822  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1823  * @dev: Network device
1824  * @txq: number of queues available
1825  *
1826  * If real_num_tx_queues is changed the tc mappings may no longer be
1827  * valid. To resolve this verify the tc mapping remains valid and if
1828  * not NULL the mapping. With no priorities mapping to this
1829  * offset/count pair it will no longer be used. In the worst case TC0
1830  * is invalid nothing can be done so disable priority mappings. If is
1831  * expected that drivers will fix this mapping if they can before
1832  * calling netif_set_real_num_tx_queues.
1833  */
1834 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1835 {
1836         int i;
1837         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1838
1839         /* If TC0 is invalidated disable TC mapping */
1840         if (tc->offset + tc->count > txq) {
1841                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1842                 dev->num_tc = 0;
1843                 return;
1844         }
1845
1846         /* Invalidated prio to tc mappings set to TC0 */
1847         for (i = 1; i < TC_BITMASK + 1; i++) {
1848                 int q = netdev_get_prio_tc_map(dev, i);
1849
1850                 tc = &dev->tc_to_txq[q];
1851                 if (tc->offset + tc->count > txq) {
1852                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1853                                 i, q);
1854                         netdev_set_prio_tc_map(dev, i, 0);
1855                 }
1856         }
1857 }
1858
1859 #ifdef CONFIG_XPS
1860 static DEFINE_MUTEX(xps_map_mutex);
1861 #define xmap_dereference(P)             \
1862         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1863
1864 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1865                                         int cpu, u16 index)
1866 {
1867         struct xps_map *map = NULL;
1868         int pos;
1869
1870         if (dev_maps)
1871                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1872
1873         for (pos = 0; map && pos < map->len; pos++) {
1874                 if (map->queues[pos] == index) {
1875                         if (map->len > 1) {
1876                                 map->queues[pos] = map->queues[--map->len];
1877                         } else {
1878                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1879                                 kfree_rcu(map, rcu);
1880                                 map = NULL;
1881                         }
1882                         break;
1883                 }
1884         }
1885
1886         return map;
1887 }
1888
1889 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1890 {
1891         struct xps_dev_maps *dev_maps;
1892         int cpu, i;
1893         bool active = false;
1894
1895         mutex_lock(&xps_map_mutex);
1896         dev_maps = xmap_dereference(dev->xps_maps);
1897
1898         if (!dev_maps)
1899                 goto out_no_maps;
1900
1901         for_each_possible_cpu(cpu) {
1902                 for (i = index; i < dev->num_tx_queues; i++) {
1903                         if (!remove_xps_queue(dev_maps, cpu, i))
1904                                 break;
1905                 }
1906                 if (i == dev->num_tx_queues)
1907                         active = true;
1908         }
1909
1910         if (!active) {
1911                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1912                 kfree_rcu(dev_maps, rcu);
1913         }
1914
1915         for (i = index; i < dev->num_tx_queues; i++)
1916                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1917                                              NUMA_NO_NODE);
1918
1919 out_no_maps:
1920         mutex_unlock(&xps_map_mutex);
1921 }
1922
1923 static struct xps_map *expand_xps_map(struct xps_map *map,
1924                                       int cpu, u16 index)
1925 {
1926         struct xps_map *new_map;
1927         int alloc_len = XPS_MIN_MAP_ALLOC;
1928         int i, pos;
1929
1930         for (pos = 0; map && pos < map->len; pos++) {
1931                 if (map->queues[pos] != index)
1932                         continue;
1933                 return map;
1934         }
1935
1936         /* Need to add queue to this CPU's existing map */
1937         if (map) {
1938                 if (pos < map->alloc_len)
1939                         return map;
1940
1941                 alloc_len = map->alloc_len * 2;
1942         }
1943
1944         /* Need to allocate new map to store queue on this CPU's map */
1945         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1946                                cpu_to_node(cpu));
1947         if (!new_map)
1948                 return NULL;
1949
1950         for (i = 0; i < pos; i++)
1951                 new_map->queues[i] = map->queues[i];
1952         new_map->alloc_len = alloc_len;
1953         new_map->len = pos;
1954
1955         return new_map;
1956 }
1957
1958 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1959 {
1960         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1961         struct xps_map *map, *new_map;
1962         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1963         int cpu, numa_node_id = -2;
1964         bool active = false;
1965
1966         mutex_lock(&xps_map_mutex);
1967
1968         dev_maps = xmap_dereference(dev->xps_maps);
1969
1970         /* allocate memory for queue storage */
1971         for_each_online_cpu(cpu) {
1972                 if (!cpumask_test_cpu(cpu, mask))
1973                         continue;
1974
1975                 if (!new_dev_maps)
1976                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1977                 if (!new_dev_maps)
1978                         return -ENOMEM;
1979
1980                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1981                                  NULL;
1982
1983                 map = expand_xps_map(map, cpu, index);
1984                 if (!map)
1985                         goto error;
1986
1987                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1988         }
1989
1990         if (!new_dev_maps)
1991                 goto out_no_new_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1995                         /* add queue to CPU maps */
1996                         int pos = 0;
1997
1998                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1999                         while ((pos < map->len) && (map->queues[pos] != index))
2000                                 pos++;
2001
2002                         if (pos == map->len)
2003                                 map->queues[map->len++] = index;
2004 #ifdef CONFIG_NUMA
2005                         if (numa_node_id == -2)
2006                                 numa_node_id = cpu_to_node(cpu);
2007                         else if (numa_node_id != cpu_to_node(cpu))
2008                                 numa_node_id = -1;
2009 #endif
2010                 } else if (dev_maps) {
2011                         /* fill in the new device map from the old device map */
2012                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2013                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2014                 }
2015
2016         }
2017
2018         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2019
2020         /* Cleanup old maps */
2021         if (dev_maps) {
2022                 for_each_possible_cpu(cpu) {
2023                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2025                         if (map && map != new_map)
2026                                 kfree_rcu(map, rcu);
2027                 }
2028
2029                 kfree_rcu(dev_maps, rcu);
2030         }
2031
2032         dev_maps = new_dev_maps;
2033         active = true;
2034
2035 out_no_new_maps:
2036         /* update Tx queue numa node */
2037         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2038                                      (numa_node_id >= 0) ? numa_node_id :
2039                                      NUMA_NO_NODE);
2040
2041         if (!dev_maps)
2042                 goto out_no_maps;
2043
2044         /* removes queue from unused CPUs */
2045         for_each_possible_cpu(cpu) {
2046                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2047                         continue;
2048
2049                 if (remove_xps_queue(dev_maps, cpu, index))
2050                         active = true;
2051         }
2052
2053         /* free map if not active */
2054         if (!active) {
2055                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2056                 kfree_rcu(dev_maps, rcu);
2057         }
2058
2059 out_no_maps:
2060         mutex_unlock(&xps_map_mutex);
2061
2062         return 0;
2063 error:
2064         /* remove any maps that we added */
2065         for_each_possible_cpu(cpu) {
2066                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2067                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2068                                  NULL;
2069                 if (new_map && new_map != map)
2070                         kfree(new_map);
2071         }
2072
2073         mutex_unlock(&xps_map_mutex);
2074
2075         kfree(new_dev_maps);
2076         return -ENOMEM;
2077 }
2078 EXPORT_SYMBOL(netif_set_xps_queue);
2079
2080 #endif
2081 /*
2082  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2083  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2084  */
2085 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2086 {
2087         int rc;
2088
2089         if (txq < 1 || txq > dev->num_tx_queues)
2090                 return -EINVAL;
2091
2092         if (dev->reg_state == NETREG_REGISTERED ||
2093             dev->reg_state == NETREG_UNREGISTERING) {
2094                 ASSERT_RTNL();
2095
2096                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2097                                                   txq);
2098                 if (rc)
2099                         return rc;
2100
2101                 if (dev->num_tc)
2102                         netif_setup_tc(dev, txq);
2103
2104                 if (txq < dev->real_num_tx_queues) {
2105                         qdisc_reset_all_tx_gt(dev, txq);
2106 #ifdef CONFIG_XPS
2107                         netif_reset_xps_queues_gt(dev, txq);
2108 #endif
2109                 }
2110         }
2111
2112         dev->real_num_tx_queues = txq;
2113         return 0;
2114 }
2115 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2116
2117 #ifdef CONFIG_RPS
2118 /**
2119  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2120  *      @dev: Network device
2121  *      @rxq: Actual number of RX queues
2122  *
2123  *      This must be called either with the rtnl_lock held or before
2124  *      registration of the net device.  Returns 0 on success, or a
2125  *      negative error code.  If called before registration, it always
2126  *      succeeds.
2127  */
2128 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2129 {
2130         int rc;
2131
2132         if (rxq < 1 || rxq > dev->num_rx_queues)
2133                 return -EINVAL;
2134
2135         if (dev->reg_state == NETREG_REGISTERED) {
2136                 ASSERT_RTNL();
2137
2138                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2139                                                   rxq);
2140                 if (rc)
2141                         return rc;
2142         }
2143
2144         dev->real_num_rx_queues = rxq;
2145         return 0;
2146 }
2147 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2148 #endif
2149
2150 /**
2151  * netif_get_num_default_rss_queues - default number of RSS queues
2152  *
2153  * This routine should set an upper limit on the number of RSS queues
2154  * used by default by multiqueue devices.
2155  */
2156 int netif_get_num_default_rss_queues(void)
2157 {
2158         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2159 }
2160 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2161
2162 static inline void __netif_reschedule(struct Qdisc *q)
2163 {
2164         struct softnet_data *sd;
2165         unsigned long flags;
2166
2167         local_irq_save(flags);
2168         sd = &__get_cpu_var(softnet_data);
2169         q->next_sched = NULL;
2170         *sd->output_queue_tailp = q;
2171         sd->output_queue_tailp = &q->next_sched;
2172         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2173         local_irq_restore(flags);
2174 }
2175
2176 void __netif_schedule(struct Qdisc *q)
2177 {
2178         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2179                 __netif_reschedule(q);
2180 }
2181 EXPORT_SYMBOL(__netif_schedule);
2182
2183 void dev_kfree_skb_irq(struct sk_buff *skb)
2184 {
2185         if (atomic_dec_and_test(&skb->users)) {
2186                 struct softnet_data *sd;
2187                 unsigned long flags;
2188
2189                 local_irq_save(flags);
2190                 sd = &__get_cpu_var(softnet_data);
2191                 skb->next = sd->completion_queue;
2192                 sd->completion_queue = skb;
2193                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2194                 local_irq_restore(flags);
2195         }
2196 }
2197 EXPORT_SYMBOL(dev_kfree_skb_irq);
2198
2199 void dev_kfree_skb_any(struct sk_buff *skb)
2200 {
2201         if (in_irq() || irqs_disabled())
2202                 dev_kfree_skb_irq(skb);
2203         else
2204                 dev_kfree_skb(skb);
2205 }
2206 EXPORT_SYMBOL(dev_kfree_skb_any);
2207
2208
2209 /**
2210  * netif_device_detach - mark device as removed
2211  * @dev: network device
2212  *
2213  * Mark device as removed from system and therefore no longer available.
2214  */
2215 void netif_device_detach(struct net_device *dev)
2216 {
2217         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2218             netif_running(dev)) {
2219                 netif_tx_stop_all_queues(dev);
2220         }
2221 }
2222 EXPORT_SYMBOL(netif_device_detach);
2223
2224 /**
2225  * netif_device_attach - mark device as attached
2226  * @dev: network device
2227  *
2228  * Mark device as attached from system and restart if needed.
2229  */
2230 void netif_device_attach(struct net_device *dev)
2231 {
2232         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2233             netif_running(dev)) {
2234                 netif_tx_wake_all_queues(dev);
2235                 __netdev_watchdog_up(dev);
2236         }
2237 }
2238 EXPORT_SYMBOL(netif_device_attach);
2239
2240 static void skb_warn_bad_offload(const struct sk_buff *skb)
2241 {
2242         static const netdev_features_t null_features = 0;
2243         struct net_device *dev = skb->dev;
2244         const char *driver = "";
2245
2246         if (dev && dev->dev.parent)
2247                 driver = dev_driver_string(dev->dev.parent);
2248
2249         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2250              "gso_type=%d ip_summed=%d\n",
2251              driver, dev ? &dev->features : &null_features,
2252              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2253              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2254              skb_shinfo(skb)->gso_type, skb->ip_summed);
2255 }
2256
2257 /*
2258  * Invalidate hardware checksum when packet is to be mangled, and
2259  * complete checksum manually on outgoing path.
2260  */
2261 int skb_checksum_help(struct sk_buff *skb)
2262 {
2263         __wsum csum;
2264         int ret = 0, offset;
2265
2266         if (skb->ip_summed == CHECKSUM_COMPLETE)
2267                 goto out_set_summed;
2268
2269         if (unlikely(skb_shinfo(skb)->gso_size)) {
2270                 skb_warn_bad_offload(skb);
2271                 return -EINVAL;
2272         }
2273
2274         offset = skb_checksum_start_offset(skb);
2275         BUG_ON(offset >= skb_headlen(skb));
2276         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2277
2278         offset += skb->csum_offset;
2279         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2280
2281         if (skb_cloned(skb) &&
2282             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2283                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2284                 if (ret)
2285                         goto out;
2286         }
2287
2288         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2289 out_set_summed:
2290         skb->ip_summed = CHECKSUM_NONE;
2291 out:
2292         return ret;
2293 }
2294 EXPORT_SYMBOL(skb_checksum_help);
2295
2296 /**
2297  *      skb_gso_segment - Perform segmentation on skb.
2298  *      @skb: buffer to segment
2299  *      @features: features for the output path (see dev->features)
2300  *
2301  *      This function segments the given skb and returns a list of segments.
2302  *
2303  *      It may return NULL if the skb requires no segmentation.  This is
2304  *      only possible when GSO is used for verifying header integrity.
2305  */
2306 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2307         netdev_features_t features)
2308 {
2309         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2310         struct packet_offload *ptype;
2311         __be16 type = skb->protocol;
2312         int vlan_depth = ETH_HLEN;
2313         int err;
2314
2315         while (type == htons(ETH_P_8021Q)) {
2316                 struct vlan_hdr *vh;
2317
2318                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2319                         return ERR_PTR(-EINVAL);
2320
2321                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2322                 type = vh->h_vlan_encapsulated_proto;
2323                 vlan_depth += VLAN_HLEN;
2324         }
2325
2326         skb_reset_mac_header(skb);
2327         skb->mac_len = skb->network_header - skb->mac_header;
2328         __skb_pull(skb, skb->mac_len);
2329
2330         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2331                 skb_warn_bad_offload(skb);
2332
2333                 if (skb_header_cloned(skb) &&
2334                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2335                         return ERR_PTR(err);
2336         }
2337
2338         rcu_read_lock();
2339         list_for_each_entry_rcu(ptype, &offload_base, list) {
2340                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2341                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2342                                 err = ptype->callbacks.gso_send_check(skb);
2343                                 segs = ERR_PTR(err);
2344                                 if (err || skb_gso_ok(skb, features))
2345                                         break;
2346                                 __skb_push(skb, (skb->data -
2347                                                  skb_network_header(skb)));
2348                         }
2349                         segs = ptype->callbacks.gso_segment(skb, features);
2350                         break;
2351                 }
2352         }
2353         rcu_read_unlock();
2354
2355         __skb_push(skb, skb->data - skb_mac_header(skb));
2356
2357         return segs;
2358 }
2359 EXPORT_SYMBOL(skb_gso_segment);
2360
2361 /* Take action when hardware reception checksum errors are detected. */
2362 #ifdef CONFIG_BUG
2363 void netdev_rx_csum_fault(struct net_device *dev)
2364 {
2365         if (net_ratelimit()) {
2366                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2367                 dump_stack();
2368         }
2369 }
2370 EXPORT_SYMBOL(netdev_rx_csum_fault);
2371 #endif
2372
2373 /* Actually, we should eliminate this check as soon as we know, that:
2374  * 1. IOMMU is present and allows to map all the memory.
2375  * 2. No high memory really exists on this machine.
2376  */
2377
2378 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2379 {
2380 #ifdef CONFIG_HIGHMEM
2381         int i;
2382         if (!(dev->features & NETIF_F_HIGHDMA)) {
2383                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2384                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2385                         if (PageHighMem(skb_frag_page(frag)))
2386                                 return 1;
2387                 }
2388         }
2389
2390         if (PCI_DMA_BUS_IS_PHYS) {
2391                 struct device *pdev = dev->dev.parent;
2392
2393                 if (!pdev)
2394                         return 0;
2395                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2396                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2397                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2398                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2399                                 return 1;
2400                 }
2401         }
2402 #endif
2403         return 0;
2404 }
2405
2406 struct dev_gso_cb {
2407         void (*destructor)(struct sk_buff *skb);
2408 };
2409
2410 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2411
2412 static void dev_gso_skb_destructor(struct sk_buff *skb)
2413 {
2414         struct dev_gso_cb *cb;
2415
2416         do {
2417                 struct sk_buff *nskb = skb->next;
2418
2419                 skb->next = nskb->next;
2420                 nskb->next = NULL;
2421                 kfree_skb(nskb);
2422         } while (skb->next);
2423
2424         cb = DEV_GSO_CB(skb);
2425         if (cb->destructor)
2426                 cb->destructor(skb);
2427 }
2428
2429 /**
2430  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2431  *      @skb: buffer to segment
2432  *      @features: device features as applicable to this skb
2433  *
2434  *      This function segments the given skb and stores the list of segments
2435  *      in skb->next.
2436  */
2437 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2438 {
2439         struct sk_buff *segs;
2440
2441         segs = skb_gso_segment(skb, features);
2442
2443         /* Verifying header integrity only. */
2444         if (!segs)
2445                 return 0;
2446
2447         if (IS_ERR(segs))
2448                 return PTR_ERR(segs);
2449
2450         skb->next = segs;
2451         DEV_GSO_CB(skb)->destructor = skb->destructor;
2452         skb->destructor = dev_gso_skb_destructor;
2453
2454         return 0;
2455 }
2456
2457 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2458 {
2459         return ((features & NETIF_F_GEN_CSUM) ||
2460                 ((features & NETIF_F_V4_CSUM) &&
2461                  protocol == htons(ETH_P_IP)) ||
2462                 ((features & NETIF_F_V6_CSUM) &&
2463                  protocol == htons(ETH_P_IPV6)) ||
2464                 ((features & NETIF_F_FCOE_CRC) &&
2465                  protocol == htons(ETH_P_FCOE)));
2466 }
2467
2468 static netdev_features_t harmonize_features(struct sk_buff *skb,
2469         __be16 protocol, netdev_features_t features)
2470 {
2471         if (skb->ip_summed != CHECKSUM_NONE &&
2472             !can_checksum_protocol(features, protocol)) {
2473                 features &= ~NETIF_F_ALL_CSUM;
2474                 features &= ~NETIF_F_SG;
2475         } else if (illegal_highdma(skb->dev, skb)) {
2476                 features &= ~NETIF_F_SG;
2477         }
2478
2479         return features;
2480 }
2481
2482 netdev_features_t netif_skb_features(struct sk_buff *skb)
2483 {
2484         __be16 protocol = skb->protocol;
2485         netdev_features_t features = skb->dev->features;
2486
2487         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2488                 features &= ~NETIF_F_GSO_MASK;
2489
2490         if (protocol == htons(ETH_P_8021Q)) {
2491                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2492                 protocol = veh->h_vlan_encapsulated_proto;
2493         } else if (!vlan_tx_tag_present(skb)) {
2494                 return harmonize_features(skb, protocol, features);
2495         }
2496
2497         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2498
2499         if (protocol != htons(ETH_P_8021Q)) {
2500                 return harmonize_features(skb, protocol, features);
2501         } else {
2502                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2503                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2504                 return harmonize_features(skb, protocol, features);
2505         }
2506 }
2507 EXPORT_SYMBOL(netif_skb_features);
2508
2509 /*
2510  * Returns true if either:
2511  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2512  *      2. skb is fragmented and the device does not support SG.
2513  */
2514 static inline int skb_needs_linearize(struct sk_buff *skb,
2515                                       int features)
2516 {
2517         return skb_is_nonlinear(skb) &&
2518                         ((skb_has_frag_list(skb) &&
2519                                 !(features & NETIF_F_FRAGLIST)) ||
2520                         (skb_shinfo(skb)->nr_frags &&
2521                                 !(features & NETIF_F_SG)));
2522 }
2523
2524 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2525                         struct netdev_queue *txq)
2526 {
2527         const struct net_device_ops *ops = dev->netdev_ops;
2528         int rc = NETDEV_TX_OK;
2529         unsigned int skb_len;
2530
2531         if (likely(!skb->next)) {
2532                 netdev_features_t features;
2533
2534                 /*
2535                  * If device doesn't need skb->dst, release it right now while
2536                  * its hot in this cpu cache
2537                  */
2538                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2539                         skb_dst_drop(skb);
2540
2541                 features = netif_skb_features(skb);
2542
2543                 if (vlan_tx_tag_present(skb) &&
2544                     !(features & NETIF_F_HW_VLAN_TX)) {
2545                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2546                         if (unlikely(!skb))
2547                                 goto out;
2548
2549                         skb->vlan_tci = 0;
2550                 }
2551
2552                 /* If encapsulation offload request, verify we are testing
2553                  * hardware encapsulation features instead of standard
2554                  * features for the netdev
2555                  */
2556                 if (skb->encapsulation)
2557                         features &= dev->hw_enc_features;
2558
2559                 if (netif_needs_gso(skb, features)) {
2560                         if (unlikely(dev_gso_segment(skb, features)))
2561                                 goto out_kfree_skb;
2562                         if (skb->next)
2563                                 goto gso;
2564                 } else {
2565                         if (skb_needs_linearize(skb, features) &&
2566                             __skb_linearize(skb))
2567                                 goto out_kfree_skb;
2568
2569                         /* If packet is not checksummed and device does not
2570                          * support checksumming for this protocol, complete
2571                          * checksumming here.
2572                          */
2573                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2574                                 if (skb->encapsulation)
2575                                         skb_set_inner_transport_header(skb,
2576                                                 skb_checksum_start_offset(skb));
2577                                 else
2578                                         skb_set_transport_header(skb,
2579                                                 skb_checksum_start_offset(skb));
2580                                 if (!(features & NETIF_F_ALL_CSUM) &&
2581                                      skb_checksum_help(skb))
2582                                         goto out_kfree_skb;
2583                         }
2584                 }
2585
2586                 if (!list_empty(&ptype_all))
2587                         dev_queue_xmit_nit(skb, dev);
2588
2589                 skb_len = skb->len;
2590                 rc = ops->ndo_start_xmit(skb, dev);
2591                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2592                 if (rc == NETDEV_TX_OK)
2593                         txq_trans_update(txq);
2594                 return rc;
2595         }
2596
2597 gso:
2598         do {
2599                 struct sk_buff *nskb = skb->next;
2600
2601                 skb->next = nskb->next;
2602                 nskb->next = NULL;
2603
2604                 /*
2605                  * If device doesn't need nskb->dst, release it right now while
2606                  * its hot in this cpu cache
2607                  */
2608                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2609                         skb_dst_drop(nskb);
2610
2611                 if (!list_empty(&ptype_all))
2612                         dev_queue_xmit_nit(nskb, dev);
2613
2614                 skb_len = nskb->len;
2615                 rc = ops->ndo_start_xmit(nskb, dev);
2616                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2617                 if (unlikely(rc != NETDEV_TX_OK)) {
2618                         if (rc & ~NETDEV_TX_MASK)
2619                                 goto out_kfree_gso_skb;
2620                         nskb->next = skb->next;
2621                         skb->next = nskb;
2622                         return rc;
2623                 }
2624                 txq_trans_update(txq);
2625                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2626                         return NETDEV_TX_BUSY;
2627         } while (skb->next);
2628
2629 out_kfree_gso_skb:
2630         if (likely(skb->next == NULL))
2631                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2632 out_kfree_skb:
2633         kfree_skb(skb);
2634 out:
2635         return rc;
2636 }
2637
2638 static void qdisc_pkt_len_init(struct sk_buff *skb)
2639 {
2640         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2641
2642         qdisc_skb_cb(skb)->pkt_len = skb->len;
2643
2644         /* To get more precise estimation of bytes sent on wire,
2645          * we add to pkt_len the headers size of all segments
2646          */
2647         if (shinfo->gso_size)  {
2648                 unsigned int hdr_len;
2649
2650                 /* mac layer + network layer */
2651                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2652
2653                 /* + transport layer */
2654                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2655                         hdr_len += tcp_hdrlen(skb);
2656                 else
2657                         hdr_len += sizeof(struct udphdr);
2658                 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2659         }
2660 }
2661
2662 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2663                                  struct net_device *dev,
2664                                  struct netdev_queue *txq)
2665 {
2666         spinlock_t *root_lock = qdisc_lock(q);
2667         bool contended;
2668         int rc;
2669
2670         qdisc_pkt_len_init(skb);
2671         qdisc_calculate_pkt_len(skb, q);
2672         /*
2673          * Heuristic to force contended enqueues to serialize on a
2674          * separate lock before trying to get qdisc main lock.
2675          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2676          * and dequeue packets faster.
2677          */
2678         contended = qdisc_is_running(q);
2679         if (unlikely(contended))
2680                 spin_lock(&q->busylock);
2681
2682         spin_lock(root_lock);
2683         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2684                 kfree_skb(skb);
2685                 rc = NET_XMIT_DROP;
2686         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2687                    qdisc_run_begin(q)) {
2688                 /*
2689                  * This is a work-conserving queue; there are no old skbs
2690                  * waiting to be sent out; and the qdisc is not running -
2691                  * xmit the skb directly.
2692                  */
2693                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2694                         skb_dst_force(skb);
2695
2696                 qdisc_bstats_update(q, skb);
2697
2698                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2699                         if (unlikely(contended)) {
2700                                 spin_unlock(&q->busylock);
2701                                 contended = false;
2702                         }
2703                         __qdisc_run(q);
2704                 } else
2705                         qdisc_run_end(q);
2706
2707                 rc = NET_XMIT_SUCCESS;
2708         } else {
2709                 skb_dst_force(skb);
2710                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2711                 if (qdisc_run_begin(q)) {
2712                         if (unlikely(contended)) {
2713                                 spin_unlock(&q->busylock);
2714                                 contended = false;
2715                         }
2716                         __qdisc_run(q);
2717                 }
2718         }
2719         spin_unlock(root_lock);
2720         if (unlikely(contended))
2721                 spin_unlock(&q->busylock);
2722         return rc;
2723 }
2724
2725 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2726 static void skb_update_prio(struct sk_buff *skb)
2727 {
2728         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2729
2730         if (!skb->priority && skb->sk && map) {
2731                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2732
2733                 if (prioidx < map->priomap_len)
2734                         skb->priority = map->priomap[prioidx];
2735         }
2736 }
2737 #else
2738 #define skb_update_prio(skb)
2739 #endif
2740
2741 static DEFINE_PER_CPU(int, xmit_recursion);
2742 #define RECURSION_LIMIT 10
2743
2744 /**
2745  *      dev_loopback_xmit - loop back @skb
2746  *      @skb: buffer to transmit
2747  */
2748 int dev_loopback_xmit(struct sk_buff *skb)
2749 {
2750         skb_reset_mac_header(skb);
2751         __skb_pull(skb, skb_network_offset(skb));
2752         skb->pkt_type = PACKET_LOOPBACK;
2753         skb->ip_summed = CHECKSUM_UNNECESSARY;
2754         WARN_ON(!skb_dst(skb));
2755         skb_dst_force(skb);
2756         netif_rx_ni(skb);
2757         return 0;
2758 }
2759 EXPORT_SYMBOL(dev_loopback_xmit);
2760
2761 /**
2762  *      dev_queue_xmit - transmit a buffer
2763  *      @skb: buffer to transmit
2764  *
2765  *      Queue a buffer for transmission to a network device. The caller must
2766  *      have set the device and priority and built the buffer before calling
2767  *      this function. The function can be called from an interrupt.
2768  *
2769  *      A negative errno code is returned on a failure. A success does not
2770  *      guarantee the frame will be transmitted as it may be dropped due
2771  *      to congestion or traffic shaping.
2772  *
2773  * -----------------------------------------------------------------------------------
2774  *      I notice this method can also return errors from the queue disciplines,
2775  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2776  *      be positive.
2777  *
2778  *      Regardless of the return value, the skb is consumed, so it is currently
2779  *      difficult to retry a send to this method.  (You can bump the ref count
2780  *      before sending to hold a reference for retry if you are careful.)
2781  *
2782  *      When calling this method, interrupts MUST be enabled.  This is because
2783  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2784  *          --BLG
2785  */
2786 int dev_queue_xmit(struct sk_buff *skb)
2787 {
2788         struct net_device *dev = skb->dev;
2789         struct netdev_queue *txq;
2790         struct Qdisc *q;
2791         int rc = -ENOMEM;
2792
2793         /* Disable soft irqs for various locks below. Also
2794          * stops preemption for RCU.
2795          */
2796         rcu_read_lock_bh();
2797
2798         skb_update_prio(skb);
2799
2800         txq = netdev_pick_tx(dev, skb);
2801         q = rcu_dereference_bh(txq->qdisc);
2802
2803 #ifdef CONFIG_NET_CLS_ACT
2804         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2805 #endif
2806         trace_net_dev_queue(skb);
2807         if (q->enqueue) {
2808                 rc = __dev_xmit_skb(skb, q, dev, txq);
2809                 goto out;
2810         }
2811
2812         /* The device has no queue. Common case for software devices:
2813            loopback, all the sorts of tunnels...
2814
2815            Really, it is unlikely that netif_tx_lock protection is necessary
2816            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2817            counters.)
2818            However, it is possible, that they rely on protection
2819            made by us here.
2820
2821            Check this and shot the lock. It is not prone from deadlocks.
2822            Either shot noqueue qdisc, it is even simpler 8)
2823          */
2824         if (dev->flags & IFF_UP) {
2825                 int cpu = smp_processor_id(); /* ok because BHs are off */
2826
2827                 if (txq->xmit_lock_owner != cpu) {
2828
2829                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2830                                 goto recursion_alert;
2831
2832                         HARD_TX_LOCK(dev, txq, cpu);
2833
2834                         if (!netif_xmit_stopped(txq)) {
2835                                 __this_cpu_inc(xmit_recursion);
2836                                 rc = dev_hard_start_xmit(skb, dev, txq);
2837                                 __this_cpu_dec(xmit_recursion);
2838                                 if (dev_xmit_complete(rc)) {
2839                                         HARD_TX_UNLOCK(dev, txq);
2840                                         goto out;
2841                                 }
2842                         }
2843                         HARD_TX_UNLOCK(dev, txq);
2844                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2845                                              dev->name);
2846                 } else {
2847                         /* Recursion is detected! It is possible,
2848                          * unfortunately
2849                          */
2850 recursion_alert:
2851                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2852                                              dev->name);
2853                 }
2854         }
2855
2856         rc = -ENETDOWN;
2857         rcu_read_unlock_bh();
2858
2859         kfree_skb(skb);
2860         return rc;
2861 out:
2862         rcu_read_unlock_bh();
2863         return rc;
2864 }
2865 EXPORT_SYMBOL(dev_queue_xmit);
2866
2867
2868 /*=======================================================================
2869                         Receiver routines
2870   =======================================================================*/
2871
2872 int netdev_max_backlog __read_mostly = 1000;
2873 EXPORT_SYMBOL(netdev_max_backlog);
2874
2875 int netdev_tstamp_prequeue __read_mostly = 1;
2876 int netdev_budget __read_mostly = 300;
2877 int weight_p __read_mostly = 64;            /* old backlog weight */
2878
2879 /* Called with irq disabled */
2880 static inline void ____napi_schedule(struct softnet_data *sd,
2881                                      struct napi_struct *napi)
2882 {
2883         list_add_tail(&napi->poll_list, &sd->poll_list);
2884         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2885 }
2886
2887 #ifdef CONFIG_RPS
2888
2889 /* One global table that all flow-based protocols share. */
2890 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2891 EXPORT_SYMBOL(rps_sock_flow_table);
2892
2893 struct static_key rps_needed __read_mostly;
2894
2895 static struct rps_dev_flow *
2896 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2897             struct rps_dev_flow *rflow, u16 next_cpu)
2898 {
2899         if (next_cpu != RPS_NO_CPU) {
2900 #ifdef CONFIG_RFS_ACCEL
2901                 struct netdev_rx_queue *rxqueue;
2902                 struct rps_dev_flow_table *flow_table;
2903                 struct rps_dev_flow *old_rflow;
2904                 u32 flow_id;
2905                 u16 rxq_index;
2906                 int rc;
2907
2908                 /* Should we steer this flow to a different hardware queue? */
2909                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2910                     !(dev->features & NETIF_F_NTUPLE))
2911                         goto out;
2912                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2913                 if (rxq_index == skb_get_rx_queue(skb))
2914                         goto out;
2915
2916                 rxqueue = dev->_rx + rxq_index;
2917                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2918                 if (!flow_table)
2919                         goto out;
2920                 flow_id = skb->rxhash & flow_table->mask;
2921                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2922                                                         rxq_index, flow_id);
2923                 if (rc < 0)
2924                         goto out;
2925                 old_rflow = rflow;
2926                 rflow = &flow_table->flows[flow_id];
2927                 rflow->filter = rc;
2928                 if (old_rflow->filter == rflow->filter)
2929                         old_rflow->filter = RPS_NO_FILTER;
2930         out:
2931 #endif
2932                 rflow->last_qtail =
2933                         per_cpu(softnet_data, next_cpu).input_queue_head;
2934         }
2935
2936         rflow->cpu = next_cpu;
2937         return rflow;
2938 }
2939
2940 /*
2941  * get_rps_cpu is called from netif_receive_skb and returns the target
2942  * CPU from the RPS map of the receiving queue for a given skb.
2943  * rcu_read_lock must be held on entry.
2944  */
2945 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2946                        struct rps_dev_flow **rflowp)
2947 {
2948         struct netdev_rx_queue *rxqueue;
2949         struct rps_map *map;
2950         struct rps_dev_flow_table *flow_table;
2951         struct rps_sock_flow_table *sock_flow_table;
2952         int cpu = -1;
2953         u16 tcpu;
2954
2955         if (skb_rx_queue_recorded(skb)) {
2956                 u16 index = skb_get_rx_queue(skb);
2957                 if (unlikely(index >= dev->real_num_rx_queues)) {
2958                         WARN_ONCE(dev->real_num_rx_queues > 1,
2959                                   "%s received packet on queue %u, but number "
2960                                   "of RX queues is %u\n",
2961                                   dev->name, index, dev->real_num_rx_queues);
2962                         goto done;
2963                 }
2964                 rxqueue = dev->_rx + index;
2965         } else
2966                 rxqueue = dev->_rx;
2967
2968         map = rcu_dereference(rxqueue->rps_map);
2969         if (map) {
2970                 if (map->len == 1 &&
2971                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2972                         tcpu = map->cpus[0];
2973                         if (cpu_online(tcpu))
2974                                 cpu = tcpu;
2975                         goto done;
2976                 }
2977         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2978                 goto done;
2979         }
2980
2981         skb_reset_network_header(skb);
2982         if (!skb_get_rxhash(skb))
2983                 goto done;
2984
2985         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2986         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2987         if (flow_table && sock_flow_table) {
2988                 u16 next_cpu;
2989                 struct rps_dev_flow *rflow;
2990
2991                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2992                 tcpu = rflow->cpu;
2993
2994                 next_cpu = sock_flow_table->ents[skb->rxhash &
2995                     sock_flow_table->mask];
2996
2997                 /*
2998                  * If the desired CPU (where last recvmsg was done) is
2999                  * different from current CPU (one in the rx-queue flow
3000                  * table entry), switch if one of the following holds:
3001                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3002                  *   - Current CPU is offline.
3003                  *   - The current CPU's queue tail has advanced beyond the
3004                  *     last packet that was enqueued using this table entry.
3005                  *     This guarantees that all previous packets for the flow
3006                  *     have been dequeued, thus preserving in order delivery.
3007                  */
3008                 if (unlikely(tcpu != next_cpu) &&
3009                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3010                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3011                       rflow->last_qtail)) >= 0)) {
3012                         tcpu = next_cpu;
3013                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3014                 }
3015
3016                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3017                         *rflowp = rflow;
3018                         cpu = tcpu;
3019                         goto done;
3020                 }
3021         }
3022
3023         if (map) {
3024                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3025
3026                 if (cpu_online(tcpu)) {
3027                         cpu = tcpu;
3028                         goto done;
3029                 }
3030         }
3031
3032 done:
3033         return cpu;
3034 }
3035
3036 #ifdef CONFIG_RFS_ACCEL
3037
3038 /**
3039  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3040  * @dev: Device on which the filter was set
3041  * @rxq_index: RX queue index
3042  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3043  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3044  *
3045  * Drivers that implement ndo_rx_flow_steer() should periodically call
3046  * this function for each installed filter and remove the filters for
3047  * which it returns %true.
3048  */
3049 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3050                          u32 flow_id, u16 filter_id)
3051 {
3052         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3053         struct rps_dev_flow_table *flow_table;
3054         struct rps_dev_flow *rflow;
3055         bool expire = true;
3056         int cpu;
3057
3058         rcu_read_lock();
3059         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3060         if (flow_table && flow_id <= flow_table->mask) {
3061                 rflow = &flow_table->flows[flow_id];
3062                 cpu = ACCESS_ONCE(rflow->cpu);
3063                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3064                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3065                            rflow->last_qtail) <
3066                      (int)(10 * flow_table->mask)))
3067                         expire = false;
3068         }
3069         rcu_read_unlock();
3070         return expire;
3071 }
3072 EXPORT_SYMBOL(rps_may_expire_flow);
3073
3074 #endif /* CONFIG_RFS_ACCEL */
3075
3076 /* Called from hardirq (IPI) context */
3077 static void rps_trigger_softirq(void *data)
3078 {
3079         struct softnet_data *sd = data;
3080
3081         ____napi_schedule(sd, &sd->backlog);
3082         sd->received_rps++;
3083 }
3084
3085 #endif /* CONFIG_RPS */
3086
3087 /*
3088  * Check if this softnet_data structure is another cpu one
3089  * If yes, queue it to our IPI list and return 1
3090  * If no, return 0
3091  */
3092 static int rps_ipi_queued(struct softnet_data *sd)
3093 {
3094 #ifdef CONFIG_RPS
3095         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3096
3097         if (sd != mysd) {
3098                 sd->rps_ipi_next = mysd->rps_ipi_list;
3099                 mysd->rps_ipi_list = sd;
3100
3101                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3102                 return 1;
3103         }
3104 #endif /* CONFIG_RPS */
3105         return 0;
3106 }
3107
3108 /*
3109  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3110  * queue (may be a remote CPU queue).
3111  */
3112 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3113                               unsigned int *qtail)
3114 {
3115         struct softnet_data *sd;
3116         unsigned long flags;
3117
3118         sd = &per_cpu(softnet_data, cpu);
3119
3120         local_irq_save(flags);
3121
3122         rps_lock(sd);
3123         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3124                 if (skb_queue_len(&sd->input_pkt_queue)) {
3125 enqueue:
3126                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3127                         input_queue_tail_incr_save(sd, qtail);
3128                         rps_unlock(sd);
3129                         local_irq_restore(flags);
3130                         return NET_RX_SUCCESS;
3131                 }
3132
3133                 /* Schedule NAPI for backlog device
3134                  * We can use non atomic operation since we own the queue lock
3135                  */
3136                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3137                         if (!rps_ipi_queued(sd))
3138                                 ____napi_schedule(sd, &sd->backlog);
3139                 }
3140                 goto enqueue;
3141         }
3142
3143         sd->dropped++;
3144         rps_unlock(sd);
3145
3146         local_irq_restore(flags);
3147
3148         atomic_long_inc(&skb->dev->rx_dropped);
3149         kfree_skb(skb);
3150         return NET_RX_DROP;
3151 }
3152
3153 /**
3154  *      netif_rx        -       post buffer to the network code
3155  *      @skb: buffer to post
3156  *
3157  *      This function receives a packet from a device driver and queues it for
3158  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3159  *      may be dropped during processing for congestion control or by the
3160  *      protocol layers.
3161  *
3162  *      return values:
3163  *      NET_RX_SUCCESS  (no congestion)
3164  *      NET_RX_DROP     (packet was dropped)
3165  *
3166  */
3167
3168 int netif_rx(struct sk_buff *skb)
3169 {
3170         int ret;
3171
3172         /* if netpoll wants it, pretend we never saw it */
3173         if (netpoll_rx(skb))
3174                 return NET_RX_DROP;
3175
3176         net_timestamp_check(netdev_tstamp_prequeue, skb);
3177
3178         trace_netif_rx(skb);
3179 #ifdef CONFIG_RPS
3180         if (static_key_false(&rps_needed)) {
3181                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3182                 int cpu;
3183
3184                 preempt_disable();
3185                 rcu_read_lock();
3186
3187                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3188                 if (cpu < 0)
3189                         cpu = smp_processor_id();
3190
3191                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3192
3193                 rcu_read_unlock();
3194                 preempt_enable();
3195         } else
3196 #endif
3197         {
3198                 unsigned int qtail;
3199                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3200                 put_cpu();
3201         }
3202         return ret;
3203 }
3204 EXPORT_SYMBOL(netif_rx);
3205
3206 int netif_rx_ni(struct sk_buff *skb)
3207 {
3208         int err;
3209
3210         preempt_disable();
3211         err = netif_rx(skb);
3212         if (local_softirq_pending())
3213                 do_softirq();
3214         preempt_enable();
3215
3216         return err;
3217 }
3218 EXPORT_SYMBOL(netif_rx_ni);
3219
3220 static void net_tx_action(struct softirq_action *h)
3221 {
3222         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3223
3224         if (sd->completion_queue) {
3225                 struct sk_buff *clist;
3226
3227                 local_irq_disable();
3228                 clist = sd->completion_queue;
3229                 sd->completion_queue = NULL;
3230                 local_irq_enable();
3231
3232                 while (clist) {
3233                         struct sk_buff *skb = clist;
3234                         clist = clist->next;
3235
3236                         WARN_ON(atomic_read(&skb->users));
3237                         trace_kfree_skb(skb, net_tx_action);
3238                         __kfree_skb(skb);
3239                 }
3240         }
3241
3242         if (sd->output_queue) {
3243                 struct Qdisc *head;
3244
3245                 local_irq_disable();
3246                 head = sd->output_queue;
3247                 sd->output_queue = NULL;
3248                 sd->output_queue_tailp = &sd->output_queue;
3249                 local_irq_enable();
3250
3251                 while (head) {
3252                         struct Qdisc *q = head;
3253                         spinlock_t *root_lock;
3254
3255                         head = head->next_sched;
3256
3257                         root_lock = qdisc_lock(q);
3258                         if (spin_trylock(root_lock)) {
3259                                 smp_mb__before_clear_bit();
3260                                 clear_bit(__QDISC_STATE_SCHED,
3261                                           &q->state);
3262                                 qdisc_run(q);
3263                                 spin_unlock(root_lock);
3264                         } else {
3265                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3266                                               &q->state)) {
3267                                         __netif_reschedule(q);
3268                                 } else {
3269                                         smp_mb__before_clear_bit();
3270                                         clear_bit(__QDISC_STATE_SCHED,
3271                                                   &q->state);
3272                                 }
3273                         }
3274                 }
3275         }
3276 }
3277
3278 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3279     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3280 /* This hook is defined here for ATM LANE */
3281 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3282                              unsigned char *addr) __read_mostly;
3283 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3284 #endif
3285
3286 #ifdef CONFIG_NET_CLS_ACT
3287 /* TODO: Maybe we should just force sch_ingress to be compiled in
3288  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3289  * a compare and 2 stores extra right now if we dont have it on
3290  * but have CONFIG_NET_CLS_ACT
3291  * NOTE: This doesn't stop any functionality; if you dont have
3292  * the ingress scheduler, you just can't add policies on ingress.
3293  *
3294  */
3295 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3296 {
3297         struct net_device *dev = skb->dev;
3298         u32 ttl = G_TC_RTTL(skb->tc_verd);
3299         int result = TC_ACT_OK;
3300         struct Qdisc *q;
3301
3302         if (unlikely(MAX_RED_LOOP < ttl++)) {
3303                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3304                                      skb->skb_iif, dev->ifindex);
3305                 return TC_ACT_SHOT;
3306         }
3307
3308         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3309         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3310
3311         q = rxq->qdisc;
3312         if (q != &noop_qdisc) {
3313                 spin_lock(qdisc_lock(q));
3314                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3315                         result = qdisc_enqueue_root(skb, q);
3316                 spin_unlock(qdisc_lock(q));
3317         }
3318
3319         return result;
3320 }
3321
3322 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3323                                          struct packet_type **pt_prev,
3324                                          int *ret, struct net_device *orig_dev)
3325 {
3326         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3327
3328         if (!rxq || rxq->qdisc == &noop_qdisc)
3329                 goto out;
3330
3331         if (*pt_prev) {
3332                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3333                 *pt_prev = NULL;
3334         }
3335
3336         switch (ing_filter(skb, rxq)) {
3337         case TC_ACT_SHOT:
3338         case TC_ACT_STOLEN:
3339                 kfree_skb(skb);
3340                 return NULL;
3341         }
3342
3343 out:
3344         skb->tc_verd = 0;
3345         return skb;
3346 }
3347 #endif
3348
3349 /**
3350  *      netdev_rx_handler_register - register receive handler
3351  *      @dev: device to register a handler for
3352  *      @rx_handler: receive handler to register
3353  *      @rx_handler_data: data pointer that is used by rx handler
3354  *
3355  *      Register a receive hander for a device. This handler will then be
3356  *      called from __netif_receive_skb. A negative errno code is returned
3357  *      on a failure.
3358  *
3359  *      The caller must hold the rtnl_mutex.
3360  *
3361  *      For a general description of rx_handler, see enum rx_handler_result.
3362  */
3363 int netdev_rx_handler_register(struct net_device *dev,
3364                                rx_handler_func_t *rx_handler,
3365                                void *rx_handler_data)
3366 {
3367         ASSERT_RTNL();
3368
3369         if (dev->rx_handler)
3370                 return -EBUSY;
3371
3372         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3373         rcu_assign_pointer(dev->rx_handler, rx_handler);
3374
3375         return 0;
3376 }
3377 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3378
3379 /**
3380  *      netdev_rx_handler_unregister - unregister receive handler
3381  *      @dev: device to unregister a handler from
3382  *
3383  *      Unregister a receive hander from a device.
3384  *
3385  *      The caller must hold the rtnl_mutex.
3386  */
3387 void netdev_rx_handler_unregister(struct net_device *dev)
3388 {
3389
3390         ASSERT_RTNL();
3391         RCU_INIT_POINTER(dev->rx_handler, NULL);
3392         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3393 }
3394 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3395
3396 /*
3397  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3398  * the special handling of PFMEMALLOC skbs.
3399  */
3400 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3401 {
3402         switch (skb->protocol) {
3403         case __constant_htons(ETH_P_ARP):
3404         case __constant_htons(ETH_P_IP):
3405         case __constant_htons(ETH_P_IPV6):
3406         case __constant_htons(ETH_P_8021Q):
3407                 return true;
3408         default:
3409                 return false;
3410         }
3411 }
3412
3413 static int __netif_receive_skb(struct sk_buff *skb)
3414 {
3415         struct packet_type *ptype, *pt_prev;
3416         rx_handler_func_t *rx_handler;
3417         struct net_device *orig_dev;
3418         struct net_device *null_or_dev;
3419         bool deliver_exact = false;
3420         int ret = NET_RX_DROP;
3421         __be16 type;
3422         unsigned long pflags = current->flags;
3423
3424         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3425
3426         trace_netif_receive_skb(skb);
3427
3428         /*
3429          * PFMEMALLOC skbs are special, they should
3430          * - be delivered to SOCK_MEMALLOC sockets only
3431          * - stay away from userspace
3432          * - have bounded memory usage
3433          *
3434          * Use PF_MEMALLOC as this saves us from propagating the allocation
3435          * context down to all allocation sites.
3436          */
3437         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3438                 current->flags |= PF_MEMALLOC;
3439
3440         /* if we've gotten here through NAPI, check netpoll */
3441         if (netpoll_receive_skb(skb))
3442                 goto out;
3443
3444         orig_dev = skb->dev;
3445
3446         skb_reset_network_header(skb);
3447         if (!skb_transport_header_was_set(skb))
3448                 skb_reset_transport_header(skb);
3449         skb_reset_mac_len(skb);
3450
3451         pt_prev = NULL;
3452
3453         rcu_read_lock();
3454
3455 another_round:
3456         skb->skb_iif = skb->dev->ifindex;
3457
3458         __this_cpu_inc(softnet_data.processed);
3459
3460         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3461                 skb = vlan_untag(skb);
3462                 if (unlikely(!skb))
3463                         goto unlock;
3464         }
3465
3466 #ifdef CONFIG_NET_CLS_ACT
3467         if (skb->tc_verd & TC_NCLS) {
3468                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3469                 goto ncls;
3470         }
3471 #endif
3472
3473         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3474                 goto skip_taps;
3475
3476         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3477                 if (!ptype->dev || ptype->dev == skb->dev) {
3478                         if (pt_prev)
3479                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3480                         pt_prev = ptype;
3481                 }
3482         }
3483
3484 skip_taps:
3485 #ifdef CONFIG_NET_CLS_ACT
3486         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3487         if (!skb)
3488                 goto unlock;
3489 ncls:
3490 #endif
3491
3492         if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3493                                 && !skb_pfmemalloc_protocol(skb))
3494                 goto drop;
3495
3496         if (vlan_tx_tag_present(skb)) {
3497                 if (pt_prev) {
3498                         ret = deliver_skb(skb, pt_prev, orig_dev);
3499                         pt_prev = NULL;
3500                 }
3501                 if (vlan_do_receive(&skb))
3502                         goto another_round;
3503                 else if (unlikely(!skb))
3504                         goto unlock;
3505         }
3506
3507         rx_handler = rcu_dereference(skb->dev->rx_handler);
3508         if (rx_handler) {
3509                 if (pt_prev) {
3510                         ret = deliver_skb(skb, pt_prev, orig_dev);
3511                         pt_prev = NULL;
3512                 }
3513                 switch (rx_handler(&skb)) {
3514                 case RX_HANDLER_CONSUMED:
3515                         goto unlock;
3516                 case RX_HANDLER_ANOTHER:
3517                         goto another_round;
3518                 case RX_HANDLER_EXACT:
3519                         deliver_exact = true;
3520                 case RX_HANDLER_PASS:
3521                         break;
3522                 default:
3523                         BUG();
3524                 }
3525         }
3526
3527         if (vlan_tx_nonzero_tag_present(skb))
3528                 skb->pkt_type = PACKET_OTHERHOST;
3529
3530         /* deliver only exact match when indicated */
3531         null_or_dev = deliver_exact ? skb->dev : NULL;
3532
3533         type = skb->protocol;
3534         list_for_each_entry_rcu(ptype,
3535                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3536                 if (ptype->type == type &&
3537                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3538                      ptype->dev == orig_dev)) {
3539                         if (pt_prev)
3540                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3541                         pt_prev = ptype;
3542                 }
3543         }
3544
3545         if (pt_prev) {
3546                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3547                         goto drop;
3548                 else
3549                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3550         } else {
3551 drop:
3552                 atomic_long_inc(&skb->dev->rx_dropped);
3553                 kfree_skb(skb);
3554                 /* Jamal, now you will not able to escape explaining
3555                  * me how you were going to use this. :-)
3556                  */
3557                 ret = NET_RX_DROP;
3558         }
3559
3560 unlock:
3561         rcu_read_unlock();
3562 out:
3563         tsk_restore_flags(current, pflags, PF_MEMALLOC);
3564         return ret;
3565 }
3566
3567 /**
3568  *      netif_receive_skb - process receive buffer from network
3569  *      @skb: buffer to process
3570  *
3571  *      netif_receive_skb() is the main receive data processing function.
3572  *      It always succeeds. The buffer may be dropped during processing
3573  *      for congestion control or by the protocol layers.
3574  *
3575  *      This function may only be called from softirq context and interrupts
3576  *      should be enabled.
3577  *
3578  *      Return values (usually ignored):
3579  *      NET_RX_SUCCESS: no congestion
3580  *      NET_RX_DROP: packet was dropped
3581  */
3582 int netif_receive_skb(struct sk_buff *skb)
3583 {
3584         net_timestamp_check(netdev_tstamp_prequeue, skb);
3585
3586         if (skb_defer_rx_timestamp(skb))
3587                 return NET_RX_SUCCESS;
3588
3589 #ifdef CONFIG_RPS
3590         if (static_key_false(&rps_needed)) {
3591                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3592                 int cpu, ret;
3593
3594                 rcu_read_lock();
3595
3596                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3597
3598                 if (cpu >= 0) {
3599                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3600                         rcu_read_unlock();
3601                         return ret;
3602                 }
3603                 rcu_read_unlock();
3604         }
3605 #endif
3606         return __netif_receive_skb(skb);
3607 }
3608 EXPORT_SYMBOL(netif_receive_skb);
3609
3610 /* Network device is going away, flush any packets still pending
3611  * Called with irqs disabled.
3612  */
3613 static void flush_backlog(void *arg)
3614 {
3615         struct net_device *dev = arg;
3616         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3617         struct sk_buff *skb, *tmp;
3618
3619         rps_lock(sd);
3620         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3621                 if (skb->dev == dev) {
3622                         __skb_unlink(skb, &sd->input_pkt_queue);
3623                         kfree_skb(skb);
3624                         input_queue_head_incr(sd);
3625                 }
3626         }
3627         rps_unlock(sd);
3628
3629         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3630                 if (skb->dev == dev) {
3631                         __skb_unlink(skb, &sd->process_queue);
3632                         kfree_skb(skb);
3633                         input_queue_head_incr(sd);
3634                 }
3635         }
3636 }
3637
3638 static int napi_gro_complete(struct sk_buff *skb)
3639 {
3640         struct packet_offload *ptype;
3641         __be16 type = skb->protocol;
3642         struct list_head *head = &offload_base;
3643         int err = -ENOENT;
3644
3645         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3646
3647         if (NAPI_GRO_CB(skb)->count == 1) {
3648                 skb_shinfo(skb)->gso_size = 0;
3649                 goto out;
3650         }
3651
3652         rcu_read_lock();
3653         list_for_each_entry_rcu(ptype, head, list) {
3654                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3655                         continue;
3656
3657                 err = ptype->callbacks.gro_complete(skb);
3658                 break;
3659         }
3660         rcu_read_unlock();
3661
3662         if (err) {
3663                 WARN_ON(&ptype->list == head);
3664                 kfree_skb(skb);
3665                 return NET_RX_SUCCESS;
3666         }
3667
3668 out:
3669         return netif_receive_skb(skb);
3670 }
3671
3672 /* napi->gro_list contains packets ordered by age.
3673  * youngest packets at the head of it.
3674  * Complete skbs in reverse order to reduce latencies.
3675  */
3676 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3677 {
3678         struct sk_buff *skb, *prev = NULL;
3679
3680         /* scan list and build reverse chain */
3681         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3682                 skb->prev = prev;
3683                 prev = skb;
3684         }
3685
3686         for (skb = prev; skb; skb = prev) {
3687                 skb->next = NULL;
3688
3689                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3690                         return;
3691
3692                 prev = skb->prev;
3693                 napi_gro_complete(skb);
3694                 napi->gro_count--;
3695         }
3696
3697         napi->gro_list = NULL;
3698 }
3699 EXPORT_SYMBOL(napi_gro_flush);
3700
3701 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3702 {
3703         struct sk_buff *p;
3704         unsigned int maclen = skb->dev->hard_header_len;
3705
3706         for (p = napi->gro_list; p; p = p->next) {
3707                 unsigned long diffs;
3708
3709                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3710                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3711                 if (maclen == ETH_HLEN)
3712                         diffs |= compare_ether_header(skb_mac_header(p),
3713                                                       skb_gro_mac_header(skb));
3714                 else if (!diffs)
3715                         diffs = memcmp(skb_mac_header(p),
3716                                        skb_gro_mac_header(skb),
3717                                        maclen);
3718                 NAPI_GRO_CB(p)->same_flow = !diffs;
3719                 NAPI_GRO_CB(p)->flush = 0;
3720         }
3721 }
3722
3723 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3724 {
3725         struct sk_buff **pp = NULL;
3726         struct packet_offload *ptype;
3727         __be16 type = skb->protocol;
3728         struct list_head *head = &offload_base;
3729         int same_flow;
3730         int mac_len;
3731         enum gro_result ret;
3732
3733         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3734                 goto normal;
3735
3736         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3737                 goto normal;
3738
3739         gro_list_prepare(napi, skb);
3740
3741         rcu_read_lock();
3742         list_for_each_entry_rcu(ptype, head, list) {
3743                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3744                         continue;
3745
3746                 skb_set_network_header(skb, skb_gro_offset(skb));
3747                 mac_len = skb->network_header - skb->mac_header;
3748                 skb->mac_len = mac_len;
3749                 NAPI_GRO_CB(skb)->same_flow = 0;
3750                 NAPI_GRO_CB(skb)->flush = 0;
3751                 NAPI_GRO_CB(skb)->free = 0;
3752
3753                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3754                 break;
3755         }
3756         rcu_read_unlock();
3757
3758         if (&ptype->list == head)
3759                 goto normal;
3760
3761         same_flow = NAPI_GRO_CB(skb)->same_flow;
3762         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3763
3764         if (pp) {
3765                 struct sk_buff *nskb = *pp;
3766
3767                 *pp = nskb->next;
3768                 nskb->next = NULL;
3769                 napi_gro_complete(nskb);
3770                 napi->gro_count--;
3771         }
3772
3773         if (same_flow)
3774                 goto ok;
3775
3776         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3777                 goto normal;
3778
3779         napi->gro_count++;
3780         NAPI_GRO_CB(skb)->count = 1;
3781         NAPI_GRO_CB(skb)->age = jiffies;
3782         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3783         skb->next = napi->gro_list;
3784         napi->gro_list = skb;
3785         ret = GRO_HELD;
3786
3787 pull:
3788         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3789                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3790
3791                 BUG_ON(skb->end - skb->tail < grow);
3792
3793                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3794
3795                 skb->tail += grow;
3796                 skb->data_len -= grow;
3797
3798                 skb_shinfo(skb)->frags[0].page_offset += grow;
3799                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3800
3801                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3802                         skb_frag_unref(skb, 0);
3803                         memmove(skb_shinfo(skb)->frags,
3804                                 skb_shinfo(skb)->frags + 1,
3805                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3806                 }
3807         }
3808
3809 ok:
3810         return ret;
3811
3812 normal:
3813         ret = GRO_NORMAL;
3814         goto pull;
3815 }
3816
3817
3818 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3819 {
3820         switch (ret) {
3821         case GRO_NORMAL:
3822                 if (netif_receive_skb(skb))
3823                         ret = GRO_DROP;
3824                 break;
3825
3826         case GRO_DROP:
3827                 kfree_skb(skb);
3828                 break;
3829
3830         case GRO_MERGED_FREE:
3831                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3832                         kmem_cache_free(skbuff_head_cache, skb);
3833                 else
3834                         __kfree_skb(skb);
3835                 break;
3836
3837         case GRO_HELD:
3838         case GRO_MERGED:
3839                 break;
3840         }
3841
3842         return ret;
3843 }
3844
3845 static void skb_gro_reset_offset(struct sk_buff *skb)
3846 {
3847         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3848         const skb_frag_t *frag0 = &pinfo->frags[0];
3849
3850         NAPI_GRO_CB(skb)->data_offset = 0;
3851         NAPI_GRO_CB(skb)->frag0 = NULL;
3852         NAPI_GRO_CB(skb)->frag0_len = 0;
3853
3854         if (skb->mac_header == skb->tail &&
3855             pinfo->nr_frags &&
3856             !PageHighMem(skb_frag_page(frag0))) {
3857                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3858                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3859         }
3860 }
3861
3862 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3863 {
3864         skb_gro_reset_offset(skb);
3865
3866         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3867 }
3868 EXPORT_SYMBOL(napi_gro_receive);
3869
3870 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3871 {
3872         __skb_pull(skb, skb_headlen(skb));
3873         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3874         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3875         skb->vlan_tci = 0;
3876         skb->dev = napi->dev;
3877         skb->skb_iif = 0;
3878
3879         napi->skb = skb;
3880 }
3881
3882 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3883 {
3884         struct sk_buff *skb = napi->skb;
3885
3886         if (!skb) {
3887                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3888                 if (skb)
3889                         napi->skb = skb;
3890         }
3891         return skb;
3892 }
3893 EXPORT_SYMBOL(napi_get_frags);
3894
3895 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3896                                gro_result_t ret)
3897 {
3898         switch (ret) {
3899         case GRO_NORMAL:
3900         case GRO_HELD:
3901                 skb->protocol = eth_type_trans(skb, skb->dev);
3902
3903                 if (ret == GRO_HELD)
3904                         skb_gro_pull(skb, -ETH_HLEN);
3905                 else if (netif_receive_skb(skb))
3906                         ret = GRO_DROP;
3907                 break;
3908
3909         case GRO_DROP:
3910         case GRO_MERGED_FREE:
3911                 napi_reuse_skb(napi, skb);
3912                 break;
3913
3914         case GRO_MERGED:
3915                 break;
3916         }
3917
3918         return ret;
3919 }
3920
3921 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3922 {
3923         struct sk_buff *skb = napi->skb;
3924         struct ethhdr *eth;
3925         unsigned int hlen;
3926         unsigned int off;
3927
3928         napi->skb = NULL;
3929
3930         skb_reset_mac_header(skb);
3931         skb_gro_reset_offset(skb);
3932
3933         off = skb_gro_offset(skb);
3934         hlen = off + sizeof(*eth);
3935         eth = skb_gro_header_fast(skb, off);
3936         if (skb_gro_header_hard(skb, hlen)) {
3937                 eth = skb_gro_header_slow(skb, hlen, off);
3938                 if (unlikely(!eth)) {
3939                         napi_reuse_skb(napi, skb);
3940                         skb = NULL;
3941                         goto out;
3942                 }
3943         }
3944
3945         skb_gro_pull(skb, sizeof(*eth));
3946
3947         /*
3948          * This works because the only protocols we care about don't require
3949          * special handling.  We'll fix it up properly at the end.
3950          */
3951         skb->protocol = eth->h_proto;
3952
3953 out:
3954         return skb;
3955 }
3956
3957 gro_result_t napi_gro_frags(struct napi_struct *napi)
3958 {
3959         struct sk_buff *skb = napi_frags_skb(napi);
3960
3961         if (!skb)
3962                 return GRO_DROP;
3963
3964         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3965 }
3966 EXPORT_SYMBOL(napi_gro_frags);
3967
3968 /*
3969  * net_rps_action sends any pending IPI's for rps.
3970  * Note: called with local irq disabled, but exits with local irq enabled.
3971  */
3972 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3973 {
3974 #ifdef CONFIG_RPS
3975         struct softnet_data *remsd = sd->rps_ipi_list;
3976
3977         if (remsd) {
3978                 sd->rps_ipi_list = NULL;
3979
3980                 local_irq_enable();
3981
3982                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3983                 while (remsd) {
3984                         struct softnet_data *next = remsd->rps_ipi_next;
3985
3986                         if (cpu_online(remsd->cpu))
3987                                 __smp_call_function_single(remsd->cpu,
3988                                                            &remsd->csd, 0);
3989                         remsd = next;
3990                 }
3991         } else
3992 #endif
3993                 local_irq_enable();
3994 }
3995
3996 static int process_backlog(struct napi_struct *napi, int quota)
3997 {
3998         int work = 0;
3999         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4000
4001 #ifdef CONFIG_RPS
4002         /* Check if we have pending ipi, its better to send them now,
4003          * not waiting net_rx_action() end.
4004          */
4005         if (sd->rps_ipi_list) {
4006                 local_irq_disable();
4007                 net_rps_action_and_irq_enable(sd);
4008         }
4009 #endif
4010         napi->weight = weight_p;
4011         local_irq_disable();
4012         while (work < quota) {
4013                 struct sk_buff *skb;
4014                 unsigned int qlen;
4015
4016                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4017                         local_irq_enable();
4018                         __netif_receive_skb(skb);
4019                         local_irq_disable();
4020                         input_queue_head_incr(sd);
4021                         if (++work >= quota) {
4022                                 local_irq_enable();
4023                                 return work;
4024                         }
4025                 }
4026
4027                 rps_lock(sd);
4028                 qlen = skb_queue_len(&sd->input_pkt_queue);
4029                 if (qlen)
4030                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4031                                                    &sd->process_queue);
4032
4033                 if (qlen < quota - work) {
4034                         /*
4035                          * Inline a custom version of __napi_complete().
4036                          * only current cpu owns and manipulates this napi,
4037                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4038                          * we can use a plain write instead of clear_bit(),
4039                          * and we dont need an smp_mb() memory barrier.
4040                          */
4041                         list_del(&napi->poll_list);
4042                         napi->state = 0;
4043
4044                         quota = work + qlen;
4045                 }
4046                 rps_unlock(sd);
4047         }
4048         local_irq_enable();
4049
4050         return work;
4051 }
4052
4053 /**
4054  * __napi_schedule - schedule for receive
4055  * @n: entry to schedule
4056  *
4057  * The entry's receive function will be scheduled to run
4058  */
4059 void __napi_schedule(struct napi_struct *n)
4060 {
4061         unsigned long flags;
4062
4063         local_irq_save(flags);
4064         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4065         local_irq_restore(flags);
4066 }
4067 EXPORT_SYMBOL(__napi_schedule);
4068
4069 void __napi_complete(struct napi_struct *n)
4070 {
4071         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4072         BUG_ON(n->gro_list);
4073
4074         list_del(&n->poll_list);
4075         smp_mb__before_clear_bit();
4076         clear_bit(NAPI_STATE_SCHED, &n->state);
4077 }
4078 EXPORT_SYMBOL(__napi_complete);
4079
4080 void napi_complete(struct napi_struct *n)
4081 {
4082         unsigned long flags;
4083
4084         /*
4085          * don't let napi dequeue from the cpu poll list
4086          * just in case its running on a different cpu
4087          */
4088         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4089                 return;
4090
4091         napi_gro_flush(n, false);
4092         local_irq_save(flags);
4093         __napi_complete(n);
4094         local_irq_restore(flags);
4095 }
4096 EXPORT_SYMBOL(napi_complete);
4097
4098 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4099                     int (*poll)(struct napi_struct *, int), int weight)
4100 {
4101         INIT_LIST_HEAD(&napi->poll_list);
4102         napi->gro_count = 0;
4103         napi->gro_list = NULL;
4104         napi->skb = NULL;
4105         napi->poll = poll;
4106         napi->weight = weight;
4107         list_add(&napi->dev_list, &dev->napi_list);
4108         napi->dev = dev;
4109 #ifdef CONFIG_NETPOLL
4110         spin_lock_init(&napi->poll_lock);
4111         napi->poll_owner = -1;
4112 #endif
4113         set_bit(NAPI_STATE_SCHED, &napi->state);
4114 }
4115 EXPORT_SYMBOL(netif_napi_add);
4116
4117 void netif_napi_del(struct napi_struct *napi)
4118 {
4119         struct sk_buff *skb, *next;
4120
4121         list_del_init(&napi->dev_list);
4122         napi_free_frags(napi);
4123
4124         for (skb = napi->gro_list; skb; skb = next) {
4125                 next = skb->next;
4126                 skb->next = NULL;
4127                 kfree_skb(skb);
4128         }
4129
4130         napi->gro_list = NULL;
4131         napi->gro_count = 0;
4132 }
4133 EXPORT_SYMBOL(netif_napi_del);
4134
4135 static void net_rx_action(struct softirq_action *h)
4136 {
4137         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4138         unsigned long time_limit = jiffies + 2;
4139         int budget = netdev_budget;
4140         void *have;
4141
4142         local_irq_disable();
4143
4144         while (!list_empty(&sd->poll_list)) {
4145                 struct napi_struct *n;
4146                 int work, weight;
4147
4148                 /* If softirq window is exhuasted then punt.
4149                  * Allow this to run for 2 jiffies since which will allow
4150                  * an average latency of 1.5/HZ.
4151                  */
4152                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4153                         goto softnet_break;
4154
4155                 local_irq_enable();
4156
4157                 /* Even though interrupts have been re-enabled, this
4158                  * access is safe because interrupts can only add new
4159                  * entries to the tail of this list, and only ->poll()
4160                  * calls can remove this head entry from the list.
4161                  */
4162                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4163
4164                 have = netpoll_poll_lock(n);
4165
4166                 weight = n->weight;
4167
4168                 /* This NAPI_STATE_SCHED test is for avoiding a race
4169                  * with netpoll's poll_napi().  Only the entity which
4170                  * obtains the lock and sees NAPI_STATE_SCHED set will
4171                  * actually make the ->poll() call.  Therefore we avoid
4172                  * accidentally calling ->poll() when NAPI is not scheduled.
4173                  */
4174                 work = 0;
4175                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4176                         work = n->poll(n, weight);
4177                         trace_napi_poll(n);
4178                 }
4179
4180                 WARN_ON_ONCE(work > weight);
4181
4182                 budget -= work;
4183
4184                 local_irq_disable();
4185
4186                 /* Drivers must not modify the NAPI state if they
4187                  * consume the entire weight.  In such cases this code
4188                  * still "owns" the NAPI instance and therefore can
4189                  * move the instance around on the list at-will.
4190                  */
4191                 if (unlikely(work == weight)) {
4192                         if (unlikely(napi_disable_pending(n))) {
4193                                 local_irq_enable();
4194                                 napi_complete(n);
4195                                 local_irq_disable();
4196                         } else {
4197                                 if (n->gro_list) {
4198                                         /* flush too old packets
4199                                          * If HZ < 1000, flush all packets.
4200                                          */
4201                                         local_irq_enable();
4202                                         napi_gro_flush(n, HZ >= 1000);
4203                                         local_irq_disable();
4204                                 }
4205                                 list_move_tail(&n->poll_list, &sd->poll_list);
4206                         }
4207                 }
4208
4209                 netpoll_poll_unlock(have);
4210         }
4211 out:
4212         net_rps_action_and_irq_enable(sd);
4213
4214 #ifdef CONFIG_NET_DMA
4215         /*
4216          * There may not be any more sk_buffs coming right now, so push
4217          * any pending DMA copies to hardware
4218          */
4219         dma_issue_pending_all();
4220 #endif
4221
4222         return;
4223
4224 softnet_break:
4225         sd->time_squeeze++;
4226         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4227         goto out;
4228 }
4229
4230 static gifconf_func_t *gifconf_list[NPROTO];
4231
4232 /**
4233  *      register_gifconf        -       register a SIOCGIF handler
4234  *      @family: Address family
4235  *      @gifconf: Function handler
4236  *
4237  *      Register protocol dependent address dumping routines. The handler
4238  *      that is passed must not be freed or reused until it has been replaced
4239  *      by another handler.
4240  */
4241 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4242 {
4243         if (family >= NPROTO)
4244                 return -EINVAL;
4245         gifconf_list[family] = gifconf;
4246         return 0;
4247 }
4248 EXPORT_SYMBOL(register_gifconf);
4249
4250
4251 /*
4252  *      Map an interface index to its name (SIOCGIFNAME)
4253  */
4254
4255 /*
4256  *      We need this ioctl for efficient implementation of the
4257  *      if_indextoname() function required by the IPv6 API.  Without
4258  *      it, we would have to search all the interfaces to find a
4259  *      match.  --pb
4260  */
4261
4262 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4263 {
4264         struct net_device *dev;
4265         struct ifreq ifr;
4266         unsigned seq;
4267
4268         /*
4269          *      Fetch the caller's info block.
4270          */
4271
4272         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4273                 return -EFAULT;
4274
4275 retry:
4276         seq = read_seqcount_begin(&devnet_rename_seq);
4277         rcu_read_lock();
4278         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4279         if (!dev) {
4280                 rcu_read_unlock();
4281                 return -ENODEV;
4282         }
4283
4284         strcpy(ifr.ifr_name, dev->name);
4285         rcu_read_unlock();
4286         if (read_seqcount_retry(&devnet_rename_seq, seq))
4287                 goto retry;
4288
4289         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4290                 return -EFAULT;
4291         return 0;
4292 }
4293
4294 /*
4295  *      Perform a SIOCGIFCONF call. This structure will change
4296  *      size eventually, and there is nothing I can do about it.
4297  *      Thus we will need a 'compatibility mode'.
4298  */
4299
4300 static int dev_ifconf(struct net *net, char __user *arg)
4301 {
4302         struct ifconf ifc;
4303         struct net_device *dev;
4304         char __user *pos;
4305         int len;
4306         int total;
4307         int i;
4308
4309         /*
4310          *      Fetch the caller's info block.
4311          */
4312
4313         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4314                 return -EFAULT;
4315
4316         pos = ifc.ifc_buf;
4317         len = ifc.ifc_len;
4318
4319         /*
4320          *      Loop over the interfaces, and write an info block for each.
4321          */
4322
4323         total = 0;
4324         for_each_netdev(net, dev) {
4325                 for (i = 0; i < NPROTO; i++) {
4326                         if (gifconf_list[i]) {
4327                                 int done;
4328                                 if (!pos)
4329                                         done = gifconf_list[i](dev, NULL, 0);
4330                                 else
4331                                         done = gifconf_list[i](dev, pos + total,
4332                                                                len - total);
4333                                 if (done < 0)
4334                                         return -EFAULT;
4335                                 total += done;
4336                         }
4337                 }
4338         }
4339
4340         /*
4341          *      All done.  Write the updated control block back to the caller.
4342          */
4343         ifc.ifc_len = total;
4344
4345         /*
4346          *      Both BSD and Solaris return 0 here, so we do too.
4347          */
4348         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4349 }
4350
4351 #ifdef CONFIG_PROC_FS
4352
4353 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4354
4355 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4356 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4357 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4358
4359 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4360 {
4361         struct net *net = seq_file_net(seq);
4362         struct net_device *dev;
4363         struct hlist_node *p;
4364         struct hlist_head *h;
4365         unsigned int count = 0, offset = get_offset(*pos);
4366
4367         h = &net->dev_name_head[get_bucket(*pos)];
4368         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4369                 if (++count == offset)
4370                         return dev;
4371         }
4372
4373         return NULL;
4374 }
4375
4376 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4377 {
4378         struct net_device *dev;
4379         unsigned int bucket;
4380
4381         do {
4382                 dev = dev_from_same_bucket(seq, pos);
4383                 if (dev)
4384                         return dev;
4385
4386                 bucket = get_bucket(*pos) + 1;
4387                 *pos = set_bucket_offset(bucket, 1);
4388         } while (bucket < NETDEV_HASHENTRIES);
4389
4390         return NULL;
4391 }
4392
4393 /*
4394  *      This is invoked by the /proc filesystem handler to display a device
4395  *      in detail.
4396  */
4397 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4398         __acquires(RCU)
4399 {
4400         rcu_read_lock();
4401         if (!*pos)
4402                 return SEQ_START_TOKEN;
4403
4404         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4405                 return NULL;
4406
4407         return dev_from_bucket(seq, pos);
4408 }
4409
4410 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4411 {
4412         ++*pos;
4413         return dev_from_bucket(seq, pos);
4414 }
4415
4416 void dev_seq_stop(struct seq_file *seq, void *v)
4417         __releases(RCU)
4418 {
4419         rcu_read_unlock();
4420 }
4421
4422 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4423 {
4424         struct rtnl_link_stats64 temp;
4425         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4426
4427         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4428                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4429                    dev->name, stats->rx_bytes, stats->rx_packets,
4430                    stats->rx_errors,
4431                    stats->rx_dropped + stats->rx_missed_errors,
4432                    stats->rx_fifo_errors,
4433                    stats->rx_length_errors + stats->rx_over_errors +
4434                     stats->rx_crc_errors + stats->rx_frame_errors,
4435                    stats->rx_compressed, stats->multicast,
4436                    stats->tx_bytes, stats->tx_packets,
4437                    stats->tx_errors, stats->tx_dropped,
4438                    stats->tx_fifo_errors, stats->collisions,
4439                    stats->tx_carrier_errors +
4440                     stats->tx_aborted_errors +
4441                     stats->tx_window_errors +
4442                     stats->tx_heartbeat_errors,
4443                    stats->tx_compressed);
4444 }
4445
4446 /*
4447  *      Called from the PROCfs module. This now uses the new arbitrary sized
4448  *      /proc/net interface to create /proc/net/dev
4449  */
4450 static int dev_seq_show(struct seq_file *seq, void *v)
4451 {
4452         if (v == SEQ_START_TOKEN)
4453                 seq_puts(seq, "Inter-|   Receive                            "
4454                               "                    |  Transmit\n"
4455                               " face |bytes    packets errs drop fifo frame "
4456                               "compressed multicast|bytes    packets errs "
4457                               "drop fifo colls carrier compressed\n");
4458         else
4459                 dev_seq_printf_stats(seq, v);
4460         return 0;
4461 }
4462
4463 static struct softnet_data *softnet_get_online(loff_t *pos)
4464 {
4465         struct softnet_data *sd = NULL;
4466
4467         while (*pos < nr_cpu_ids)
4468                 if (cpu_online(*pos)) {
4469                         sd = &per_cpu(softnet_data, *pos);
4470                         break;
4471                 } else
4472                         ++*pos;
4473         return sd;
4474 }
4475
4476 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4477 {
4478         return softnet_get_online(pos);
4479 }
4480
4481 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4482 {
4483         ++*pos;
4484         return softnet_get_online(pos);
4485 }
4486
4487 static void softnet_seq_stop(struct seq_file *seq, void *v)
4488 {
4489 }
4490
4491 static int softnet_seq_show(struct seq_file *seq, void *v)
4492 {
4493         struct softnet_data *sd = v;
4494
4495         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4496                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4497                    0, 0, 0, 0, /* was fastroute */
4498                    sd->cpu_collision, sd->received_rps);
4499         return 0;
4500 }
4501
4502 static const struct seq_operations dev_seq_ops = {
4503         .start = dev_seq_start,
4504         .next  = dev_seq_next,
4505         .stop  = dev_seq_stop,
4506         .show  = dev_seq_show,
4507 };
4508
4509 static int dev_seq_open(struct inode *inode, struct file *file)
4510 {
4511         return seq_open_net(inode, file, &dev_seq_ops,
4512                             sizeof(struct seq_net_private));
4513 }
4514
4515 static const struct file_operations dev_seq_fops = {
4516         .owner   = THIS_MODULE,
4517         .open    = dev_seq_open,
4518         .read    = seq_read,
4519         .llseek  = seq_lseek,
4520         .release = seq_release_net,
4521 };
4522
4523 static const struct seq_operations softnet_seq_ops = {
4524         .start = softnet_seq_start,
4525         .next  = softnet_seq_next,
4526         .stop  = softnet_seq_stop,
4527         .show  = softnet_seq_show,
4528 };
4529
4530 static int softnet_seq_open(struct inode *inode, struct file *file)
4531 {
4532         return seq_open(file, &softnet_seq_ops);
4533 }
4534
4535 static const struct file_operations softnet_seq_fops = {
4536         .owner   = THIS_MODULE,
4537         .open    = softnet_seq_open,
4538         .read    = seq_read,
4539         .llseek  = seq_lseek,
4540         .release = seq_release,
4541 };
4542
4543 static void *ptype_get_idx(loff_t pos)
4544 {
4545         struct packet_type *pt = NULL;
4546         loff_t i = 0;
4547         int t;
4548
4549         list_for_each_entry_rcu(pt, &ptype_all, list) {
4550                 if (i == pos)
4551                         return pt;
4552                 ++i;
4553         }
4554
4555         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4556                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4557                         if (i == pos)
4558                                 return pt;
4559                         ++i;
4560                 }
4561         }
4562         return NULL;
4563 }
4564
4565 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4566         __acquires(RCU)
4567 {
4568         rcu_read_lock();
4569         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4570 }
4571
4572 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4573 {
4574         struct packet_type *pt;
4575         struct list_head *nxt;
4576         int hash;
4577
4578         ++*pos;
4579         if (v == SEQ_START_TOKEN)
4580                 return ptype_get_idx(0);
4581
4582         pt = v;
4583         nxt = pt->list.next;
4584         if (pt->type == htons(ETH_P_ALL)) {
4585                 if (nxt != &ptype_all)
4586                         goto found;
4587                 hash = 0;
4588                 nxt = ptype_base[0].next;
4589         } else
4590                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4591
4592         while (nxt == &ptype_base[hash]) {
4593                 if (++hash >= PTYPE_HASH_SIZE)
4594                         return NULL;
4595                 nxt = ptype_base[hash].next;
4596         }
4597 found:
4598         return list_entry(nxt, struct packet_type, list);
4599 }
4600
4601 static void ptype_seq_stop(struct seq_file *seq, void *v)
4602         __releases(RCU)
4603 {
4604         rcu_read_unlock();
4605 }
4606
4607 static int ptype_seq_show(struct seq_file *seq, void *v)
4608 {
4609         struct packet_type *pt = v;
4610
4611         if (v == SEQ_START_TOKEN)
4612                 seq_puts(seq, "Type Device      Function\n");
4613         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4614                 if (pt->type == htons(ETH_P_ALL))
4615                         seq_puts(seq, "ALL ");
4616                 else
4617                         seq_printf(seq, "%04x", ntohs(pt->type));
4618
4619                 seq_printf(seq, " %-8s %pF\n",
4620                            pt->dev ? pt->dev->name : "", pt->func);
4621         }
4622
4623         return 0;
4624 }
4625
4626 static const struct seq_operations ptype_seq_ops = {
4627         .start = ptype_seq_start,
4628         .next  = ptype_seq_next,
4629         .stop  = ptype_seq_stop,
4630         .show  = ptype_seq_show,
4631 };
4632
4633 static int ptype_seq_open(struct inode *inode, struct file *file)
4634 {
4635         return seq_open_net(inode, file, &ptype_seq_ops,
4636                         sizeof(struct seq_net_private));
4637 }
4638
4639 static const struct file_operations ptype_seq_fops = {
4640         .owner   = THIS_MODULE,
4641         .open    = ptype_seq_open,
4642         .read    = seq_read,
4643         .llseek  = seq_lseek,
4644         .release = seq_release_net,
4645 };
4646
4647
4648 static int __net_init dev_proc_net_init(struct net *net)
4649 {
4650         int rc = -ENOMEM;
4651
4652         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4653                 goto out;
4654         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4655                 goto out_dev;
4656         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4657                 goto out_softnet;
4658
4659         if (wext_proc_init(net))
4660                 goto out_ptype;
4661         rc = 0;
4662 out:
4663         return rc;
4664 out_ptype:
4665         proc_net_remove(net, "ptype");
4666 out_softnet:
4667         proc_net_remove(net, "softnet_stat");
4668 out_dev:
4669         proc_net_remove(net, "dev");
4670         goto out;
4671 }
4672
4673 static void __net_exit dev_proc_net_exit(struct net *net)
4674 {
4675         wext_proc_exit(net);
4676
4677         proc_net_remove(net, "ptype");
4678         proc_net_remove(net, "softnet_stat");
4679         proc_net_remove(net, "dev");
4680 }
4681
4682 static struct pernet_operations __net_initdata dev_proc_ops = {
4683         .init = dev_proc_net_init,
4684         .exit = dev_proc_net_exit,
4685 };
4686
4687 static int __init dev_proc_init(void)
4688 {
4689         return register_pernet_subsys(&dev_proc_ops);
4690 }
4691 #else
4692 #define dev_proc_init() 0
4693 #endif  /* CONFIG_PROC_FS */
4694
4695
4696 struct netdev_upper {
4697         struct net_device *dev;
4698         bool master;
4699         struct list_head list;
4700         struct rcu_head rcu;
4701         struct list_head search_list;
4702 };
4703
4704 static void __append_search_uppers(struct list_head *search_list,
4705                                    struct net_device *dev)
4706 {
4707         struct netdev_upper *upper;
4708
4709         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4710                 /* check if this upper is not already in search list */
4711                 if (list_empty(&upper->search_list))
4712                         list_add_tail(&upper->search_list, search_list);
4713         }
4714 }
4715
4716 static bool __netdev_search_upper_dev(struct net_device *dev,
4717                                       struct net_device *upper_dev)
4718 {
4719         LIST_HEAD(search_list);
4720         struct netdev_upper *upper;
4721         struct netdev_upper *tmp;
4722         bool ret = false;
4723
4724         __append_search_uppers(&search_list, dev);
4725         list_for_each_entry(upper, &search_list, search_list) {
4726                 if (upper->dev == upper_dev) {
4727                         ret = true;
4728                         break;
4729                 }
4730                 __append_search_uppers(&search_list, upper->dev);
4731         }
4732         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4733                 INIT_LIST_HEAD(&upper->search_list);
4734         return ret;
4735 }
4736
4737 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4738                                                 struct net_device *upper_dev)
4739 {
4740         struct netdev_upper *upper;
4741
4742         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4743                 if (upper->dev == upper_dev)
4744                         return upper;
4745         }
4746         return NULL;
4747 }
4748
4749 /**
4750  * netdev_has_upper_dev - Check if device is linked to an upper device
4751  * @dev: device
4752  * @upper_dev: upper device to check
4753  *
4754  * Find out if a device is linked to specified upper device and return true
4755  * in case it is. Note that this checks only immediate upper device,
4756  * not through a complete stack of devices. The caller must hold the RTNL lock.
4757  */
4758 bool netdev_has_upper_dev(struct net_device *dev,
4759                           struct net_device *upper_dev)
4760 {
4761         ASSERT_RTNL();
4762
4763         return __netdev_find_upper(dev, upper_dev);
4764 }
4765 EXPORT_SYMBOL(netdev_has_upper_dev);
4766
4767 /**
4768  * netdev_has_any_upper_dev - Check if device is linked to some device
4769  * @dev: device
4770  *
4771  * Find out if a device is linked to an upper device and return true in case
4772  * it is. The caller must hold the RTNL lock.
4773  */
4774 bool netdev_has_any_upper_dev(struct net_device *dev)
4775 {
4776         ASSERT_RTNL();
4777
4778         return !list_empty(&dev->upper_dev_list);
4779 }
4780 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4781
4782 /**
4783  * netdev_master_upper_dev_get - Get master upper device
4784  * @dev: device
4785  *
4786  * Find a master upper device and return pointer to it or NULL in case
4787  * it's not there. The caller must hold the RTNL lock.
4788  */
4789 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4790 {
4791         struct netdev_upper *upper;
4792
4793         ASSERT_RTNL();
4794
4795         if (list_empty(&dev->upper_dev_list))
4796                 return NULL;
4797
4798         upper = list_first_entry(&dev->upper_dev_list,
4799                                  struct netdev_upper, list);
4800         if (likely(upper->master))
4801                 return upper->dev;
4802         return NULL;
4803 }
4804 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4805
4806 /**
4807  * netdev_master_upper_dev_get_rcu - Get master upper device
4808  * @dev: device
4809  *
4810  * Find a master upper device and return pointer to it or NULL in case
4811  * it's not there. The caller must hold the RCU read lock.
4812  */
4813 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4814 {
4815         struct netdev_upper *upper;
4816
4817         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4818                                        struct netdev_upper, list);
4819         if (upper && likely(upper->master))
4820                 return upper->dev;
4821         return NULL;
4822 }
4823 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4824
4825 static int __netdev_upper_dev_link(struct net_device *dev,
4826                                    struct net_device *upper_dev, bool master)
4827 {
4828         struct netdev_upper *upper;
4829
4830         ASSERT_RTNL();
4831
4832         if (dev == upper_dev)
4833                 return -EBUSY;
4834
4835         /* To prevent loops, check if dev is not upper device to upper_dev. */
4836         if (__netdev_search_upper_dev(upper_dev, dev))
4837                 return -EBUSY;
4838
4839         if (__netdev_find_upper(dev, upper_dev))
4840                 return -EEXIST;
4841
4842         if (master && netdev_master_upper_dev_get(dev))
4843                 return -EBUSY;
4844
4845         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4846         if (!upper)
4847                 return -ENOMEM;
4848
4849         upper->dev = upper_dev;
4850         upper->master = master;
4851         INIT_LIST_HEAD(&upper->search_list);
4852
4853         /* Ensure that master upper link is always the first item in list. */
4854         if (master)
4855                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4856         else
4857                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4858         dev_hold(upper_dev);
4859
4860         return 0;
4861 }
4862
4863 /**
4864  * netdev_upper_dev_link - Add a link to the upper device
4865  * @dev: device
4866  * @upper_dev: new upper device
4867  *
4868  * Adds a link to device which is upper to this one. The caller must hold
4869  * the RTNL lock. On a failure a negative errno code is returned.
4870  * On success the reference counts are adjusted and the function
4871  * returns zero.
4872  */
4873 int netdev_upper_dev_link(struct net_device *dev,
4874                           struct net_device *upper_dev)
4875 {
4876         return __netdev_upper_dev_link(dev, upper_dev, false);
4877 }
4878 EXPORT_SYMBOL(netdev_upper_dev_link);
4879
4880 /**
4881  * netdev_master_upper_dev_link - Add a master link to the upper device
4882  * @dev: device
4883  * @upper_dev: new upper device
4884  *
4885  * Adds a link to device which is upper to this one. In this case, only
4886  * one master upper device can be linked, although other non-master devices
4887  * might be linked as well. The caller must hold the RTNL lock.
4888  * On a failure a negative errno code is returned. On success the reference
4889  * counts are adjusted and the function returns zero.
4890  */
4891 int netdev_master_upper_dev_link(struct net_device *dev,
4892                                  struct net_device *upper_dev)
4893 {
4894         return __netdev_upper_dev_link(dev, upper_dev, true);
4895 }
4896 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4897
4898 /**
4899  * netdev_upper_dev_unlink - Removes a link to upper device
4900  * @dev: device
4901  * @upper_dev: new upper device
4902  *
4903  * Removes a link to device which is upper to this one. The caller must hold
4904  * the RTNL lock.
4905  */
4906 void netdev_upper_dev_unlink(struct net_device *dev,
4907                              struct net_device *upper_dev)
4908 {
4909         struct netdev_upper *upper;
4910
4911         ASSERT_RTNL();
4912
4913         upper = __netdev_find_upper(dev, upper_dev);
4914         if (!upper)
4915                 return;
4916         list_del_rcu(&upper->list);
4917         dev_put(upper_dev);
4918         kfree_rcu(upper, rcu);
4919 }
4920 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4921
4922 static void dev_change_rx_flags(struct net_device *dev, int flags)
4923 {
4924         const struct net_device_ops *ops = dev->netdev_ops;
4925
4926         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4927                 ops->ndo_change_rx_flags(dev, flags);
4928 }
4929
4930 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4931 {
4932         unsigned int old_flags = dev->flags;
4933         kuid_t uid;
4934         kgid_t gid;
4935
4936         ASSERT_RTNL();
4937
4938         dev->flags |= IFF_PROMISC;
4939         dev->promiscuity += inc;
4940         if (dev->promiscuity == 0) {
4941                 /*
4942                  * Avoid overflow.
4943                  * If inc causes overflow, untouch promisc and return error.
4944                  */
4945                 if (inc < 0)
4946                         dev->flags &= ~IFF_PROMISC;
4947                 else {
4948                         dev->promiscuity -= inc;
4949                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4950                                 dev->name);
4951                         return -EOVERFLOW;
4952                 }
4953         }
4954         if (dev->flags != old_flags) {
4955                 pr_info("device %s %s promiscuous mode\n",
4956                         dev->name,
4957                         dev->flags & IFF_PROMISC ? "entered" : "left");
4958                 if (audit_enabled) {
4959                         current_uid_gid(&uid, &gid);
4960                         audit_log(current->audit_context, GFP_ATOMIC,
4961                                 AUDIT_ANOM_PROMISCUOUS,
4962                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4963                                 dev->name, (dev->flags & IFF_PROMISC),
4964                                 (old_flags & IFF_PROMISC),
4965                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4966                                 from_kuid(&init_user_ns, uid),
4967                                 from_kgid(&init_user_ns, gid),
4968                                 audit_get_sessionid(current));
4969                 }
4970
4971                 dev_change_rx_flags(dev, IFF_PROMISC);
4972         }
4973         return 0;
4974 }
4975
4976 /**
4977  *      dev_set_promiscuity     - update promiscuity count on a device
4978  *      @dev: device
4979  *      @inc: modifier
4980  *
4981  *      Add or remove promiscuity from a device. While the count in the device
4982  *      remains above zero the interface remains promiscuous. Once it hits zero
4983  *      the device reverts back to normal filtering operation. A negative inc
4984  *      value is used to drop promiscuity on the device.
4985  *      Return 0 if successful or a negative errno code on error.
4986  */
4987 int dev_set_promiscuity(struct net_device *dev, int inc)
4988 {
4989         unsigned int old_flags = dev->flags;
4990         int err;
4991
4992         err = __dev_set_promiscuity(dev, inc);
4993         if (err < 0)
4994                 return err;
4995         if (dev->flags != old_flags)
4996                 dev_set_rx_mode(dev);
4997         return err;
4998 }
4999 EXPORT_SYMBOL(dev_set_promiscuity);
5000
5001 /**
5002  *      dev_set_allmulti        - update allmulti count on a device
5003  *      @dev: device
5004  *      @inc: modifier
5005  *
5006  *      Add or remove reception of all multicast frames to a device. While the
5007  *      count in the device remains above zero the interface remains listening
5008  *      to all interfaces. Once it hits zero the device reverts back to normal
5009  *      filtering operation. A negative @inc value is used to drop the counter
5010  *      when releasing a resource needing all multicasts.
5011  *      Return 0 if successful or a negative errno code on error.
5012  */
5013
5014 int dev_set_allmulti(struct net_device *dev, int inc)
5015 {
5016         unsigned int old_flags = dev->flags;
5017
5018         ASSERT_RTNL();
5019
5020         dev->flags |= IFF_ALLMULTI;
5021         dev->allmulti += inc;
5022         if (dev->allmulti == 0) {
5023                 /*
5024                  * Avoid overflow.
5025                  * If inc causes overflow, untouch allmulti and return error.
5026                  */
5027                 if (inc < 0)
5028                         dev->flags &= ~IFF_ALLMULTI;
5029                 else {
5030                         dev->allmulti -= inc;
5031                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5032                                 dev->name);
5033                         return -EOVERFLOW;
5034                 }
5035         }
5036         if (dev->flags ^ old_flags) {
5037                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5038                 dev_set_rx_mode(dev);
5039         }
5040         return 0;
5041 }
5042 EXPORT_SYMBOL(dev_set_allmulti);
5043
5044 /*
5045  *      Upload unicast and multicast address lists to device and
5046  *      configure RX filtering. When the device doesn't support unicast
5047  *      filtering it is put in promiscuous mode while unicast addresses
5048  *      are present.
5049  */
5050 void __dev_set_rx_mode(struct net_device *dev)
5051 {
5052         const struct net_device_ops *ops = dev->netdev_ops;
5053
5054         /* dev_open will call this function so the list will stay sane. */
5055         if (!(dev->flags&IFF_UP))
5056                 return;
5057
5058         if (!netif_device_present(dev))
5059                 return;
5060
5061         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5062                 /* Unicast addresses changes may only happen under the rtnl,
5063                  * therefore calling __dev_set_promiscuity here is safe.
5064                  */
5065                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5066                         __dev_set_promiscuity(dev, 1);
5067                         dev->uc_promisc = true;
5068                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5069                         __dev_set_promiscuity(dev, -1);
5070                         dev->uc_promisc = false;
5071                 }
5072         }
5073
5074         if (ops->ndo_set_rx_mode)
5075                 ops->ndo_set_rx_mode(dev);
5076 }
5077
5078 void dev_set_rx_mode(struct net_device *dev)
5079 {
5080         netif_addr_lock_bh(dev);
5081         __dev_set_rx_mode(dev);
5082         netif_addr_unlock_bh(dev);
5083 }
5084
5085 /**
5086  *      dev_get_flags - get flags reported to userspace
5087  *      @dev: device
5088  *
5089  *      Get the combination of flag bits exported through APIs to userspace.
5090  */
5091 unsigned int dev_get_flags(const struct net_device *dev)
5092 {
5093         unsigned int flags;
5094
5095         flags = (dev->flags & ~(IFF_PROMISC |
5096                                 IFF_ALLMULTI |
5097                                 IFF_RUNNING |
5098                                 IFF_LOWER_UP |
5099                                 IFF_DORMANT)) |
5100                 (dev->gflags & (IFF_PROMISC |
5101                                 IFF_ALLMULTI));
5102
5103         if (netif_running(dev)) {
5104                 if (netif_oper_up(dev))
5105                         flags |= IFF_RUNNING;
5106                 if (netif_carrier_ok(dev))
5107                         flags |= IFF_LOWER_UP;
5108                 if (netif_dormant(dev))
5109                         flags |= IFF_DORMANT;
5110         }
5111
5112         return flags;
5113 }
5114 EXPORT_SYMBOL(dev_get_flags);
5115
5116 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5117 {
5118         unsigned int old_flags = dev->flags;
5119         int ret;
5120
5121         ASSERT_RTNL();
5122
5123         /*
5124          *      Set the flags on our device.
5125          */
5126
5127         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5128                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5129                                IFF_AUTOMEDIA)) |
5130                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5131                                     IFF_ALLMULTI));
5132
5133         /*
5134          *      Load in the correct multicast list now the flags have changed.
5135          */
5136
5137         if ((old_flags ^ flags) & IFF_MULTICAST)
5138                 dev_change_rx_flags(dev, IFF_MULTICAST);
5139
5140         dev_set_rx_mode(dev);
5141
5142         /*
5143          *      Have we downed the interface. We handle IFF_UP ourselves
5144          *      according to user attempts to set it, rather than blindly
5145          *      setting it.
5146          */
5147
5148         ret = 0;
5149         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5150                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5151
5152                 if (!ret)
5153                         dev_set_rx_mode(dev);
5154         }
5155
5156         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5157                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5158
5159                 dev->gflags ^= IFF_PROMISC;
5160                 dev_set_promiscuity(dev, inc);
5161         }
5162
5163         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5164            is important. Some (broken) drivers set IFF_PROMISC, when
5165            IFF_ALLMULTI is requested not asking us and not reporting.
5166          */
5167         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5168                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5169
5170                 dev->gflags ^= IFF_ALLMULTI;
5171                 dev_set_allmulti(dev, inc);
5172         }
5173
5174         return ret;
5175 }
5176
5177 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5178 {
5179         unsigned int changes = dev->flags ^ old_flags;
5180
5181         if (changes & IFF_UP) {
5182                 if (dev->flags & IFF_UP)
5183                         call_netdevice_notifiers(NETDEV_UP, dev);
5184                 else
5185                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5186         }
5187
5188         if (dev->flags & IFF_UP &&
5189             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5190                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5191 }
5192
5193 /**
5194  *      dev_change_flags - change device settings
5195  *      @dev: device
5196  *      @flags: device state flags
5197  *
5198  *      Change settings on device based state flags. The flags are
5199  *      in the userspace exported format.
5200  */
5201 int dev_change_flags(struct net_device *dev, unsigned int flags)
5202 {
5203         int ret;
5204         unsigned int changes, old_flags = dev->flags;
5205
5206         ret = __dev_change_flags(dev, flags);
5207         if (ret < 0)
5208                 return ret;
5209
5210         changes = old_flags ^ dev->flags;
5211         if (changes)
5212                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
5213
5214         __dev_notify_flags(dev, old_flags);
5215         return ret;
5216 }
5217 EXPORT_SYMBOL(dev_change_flags);
5218
5219 /**
5220  *      dev_set_mtu - Change maximum transfer unit
5221  *      @dev: device
5222  *      @new_mtu: new transfer unit
5223  *
5224  *      Change the maximum transfer size of the network device.
5225  */
5226 int dev_set_mtu(struct net_device *dev, int new_mtu)
5227 {
5228         const struct net_device_ops *ops = dev->netdev_ops;
5229         int err;
5230
5231         if (new_mtu == dev->mtu)
5232                 return 0;
5233
5234         /*      MTU must be positive.    */
5235         if (new_mtu < 0)
5236                 return -EINVAL;
5237
5238         if (!netif_device_present(dev))
5239                 return -ENODEV;
5240
5241         err = 0;
5242         if (ops->ndo_change_mtu)
5243                 err = ops->ndo_change_mtu(dev, new_mtu);
5244         else
5245                 dev->mtu = new_mtu;
5246
5247         if (!err)
5248                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5249         return err;
5250 }
5251 EXPORT_SYMBOL(dev_set_mtu);
5252
5253 /**
5254  *      dev_set_group - Change group this device belongs to
5255  *      @dev: device
5256  *      @new_group: group this device should belong to
5257  */
5258 void dev_set_group(struct net_device *dev, int new_group)
5259 {
5260         dev->group = new_group;
5261 }
5262 EXPORT_SYMBOL(dev_set_group);
5263
5264 /**
5265  *      dev_set_mac_address - Change Media Access Control Address
5266  *      @dev: device
5267  *      @sa: new address
5268  *
5269  *      Change the hardware (MAC) address of the device
5270  */
5271 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5272 {
5273         const struct net_device_ops *ops = dev->netdev_ops;
5274         int err;
5275
5276         if (!ops->ndo_set_mac_address)
5277                 return -EOPNOTSUPP;
5278         if (sa->sa_family != dev->type)
5279                 return -EINVAL;
5280         if (!netif_device_present(dev))
5281                 return -ENODEV;
5282         err = ops->ndo_set_mac_address(dev, sa);
5283         if (err)
5284                 return err;
5285         dev->addr_assign_type = NET_ADDR_SET;
5286         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5287         add_device_randomness(dev->dev_addr, dev->addr_len);
5288         return 0;
5289 }
5290 EXPORT_SYMBOL(dev_set_mac_address);
5291
5292 /**
5293  *      dev_change_carrier - Change device carrier
5294  *      @dev: device
5295  *      @new_carries: new value
5296  *
5297  *      Change device carrier
5298  */
5299 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5300 {
5301         const struct net_device_ops *ops = dev->netdev_ops;
5302
5303         if (!ops->ndo_change_carrier)
5304                 return -EOPNOTSUPP;
5305         if (!netif_device_present(dev))
5306                 return -ENODEV;
5307         return ops->ndo_change_carrier(dev, new_carrier);
5308 }
5309 EXPORT_SYMBOL(dev_change_carrier);
5310
5311 /*
5312  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5313  */
5314 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5315 {
5316         int err;
5317         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5318
5319         if (!dev)
5320                 return -ENODEV;
5321
5322         switch (cmd) {
5323         case SIOCGIFFLAGS:      /* Get interface flags */
5324                 ifr->ifr_flags = (short) dev_get_flags(dev);
5325                 return 0;
5326
5327         case SIOCGIFMETRIC:     /* Get the metric on the interface
5328                                    (currently unused) */
5329                 ifr->ifr_metric = 0;
5330                 return 0;
5331
5332         case SIOCGIFMTU:        /* Get the MTU of a device */
5333                 ifr->ifr_mtu = dev->mtu;
5334                 return 0;
5335
5336         case SIOCGIFHWADDR:
5337                 if (!dev->addr_len)
5338                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5339                 else
5340                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5341                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5342                 ifr->ifr_hwaddr.sa_family = dev->type;
5343                 return 0;
5344
5345         case SIOCGIFSLAVE:
5346                 err = -EINVAL;
5347                 break;
5348
5349         case SIOCGIFMAP:
5350                 ifr->ifr_map.mem_start = dev->mem_start;
5351                 ifr->ifr_map.mem_end   = dev->mem_end;
5352                 ifr->ifr_map.base_addr = dev->base_addr;
5353                 ifr->ifr_map.irq       = dev->irq;
5354                 ifr->ifr_map.dma       = dev->dma;
5355                 ifr->ifr_map.port      = dev->if_port;
5356                 return 0;
5357
5358         case SIOCGIFINDEX:
5359                 ifr->ifr_ifindex = dev->ifindex;
5360                 return 0;
5361
5362         case SIOCGIFTXQLEN:
5363                 ifr->ifr_qlen = dev->tx_queue_len;
5364                 return 0;
5365
5366         default:
5367                 /* dev_ioctl() should ensure this case
5368                  * is never reached
5369                  */
5370                 WARN_ON(1);
5371                 err = -ENOTTY;
5372                 break;
5373
5374         }
5375         return err;
5376 }
5377
5378 /*
5379  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5380  */
5381 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5382 {
5383         int err;
5384         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5385         const struct net_device_ops *ops;
5386
5387         if (!dev)
5388                 return -ENODEV;
5389
5390         ops = dev->netdev_ops;
5391
5392         switch (cmd) {
5393         case SIOCSIFFLAGS:      /* Set interface flags */
5394                 return dev_change_flags(dev, ifr->ifr_flags);
5395
5396         case SIOCSIFMETRIC:     /* Set the metric on the interface
5397                                    (currently unused) */
5398                 return -EOPNOTSUPP;
5399
5400         case SIOCSIFMTU:        /* Set the MTU of a device */
5401                 return dev_set_mtu(dev, ifr->ifr_mtu);
5402
5403         case SIOCSIFHWADDR:
5404                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5405
5406         case SIOCSIFHWBROADCAST:
5407                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5408                         return -EINVAL;
5409                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5410                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5411                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5412                 return 0;
5413
5414         case SIOCSIFMAP:
5415                 if (ops->ndo_set_config) {
5416                         if (!netif_device_present(dev))
5417                                 return -ENODEV;
5418                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5419                 }
5420                 return -EOPNOTSUPP;
5421
5422         case SIOCADDMULTI:
5423                 if (!ops->ndo_set_rx_mode ||
5424                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5425                         return -EINVAL;
5426                 if (!netif_device_present(dev))
5427                         return -ENODEV;
5428                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5429
5430         case SIOCDELMULTI:
5431                 if (!ops->ndo_set_rx_mode ||
5432                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5433                         return -EINVAL;
5434                 if (!netif_device_present(dev))
5435                         return -ENODEV;
5436                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5437
5438         case SIOCSIFTXQLEN:
5439                 if (ifr->ifr_qlen < 0)
5440                         return -EINVAL;
5441                 dev->tx_queue_len = ifr->ifr_qlen;
5442                 return 0;
5443
5444         case SIOCSIFNAME:
5445                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5446                 return dev_change_name(dev, ifr->ifr_newname);
5447
5448         case SIOCSHWTSTAMP:
5449                 err = net_hwtstamp_validate(ifr);
5450                 if (err)
5451                         return err;
5452                 /* fall through */
5453
5454         /*
5455          *      Unknown or private ioctl
5456          */
5457         default:
5458                 if ((cmd >= SIOCDEVPRIVATE &&
5459                     cmd <= SIOCDEVPRIVATE + 15) ||
5460                     cmd == SIOCBONDENSLAVE ||
5461                     cmd == SIOCBONDRELEASE ||
5462                     cmd == SIOCBONDSETHWADDR ||
5463                     cmd == SIOCBONDSLAVEINFOQUERY ||
5464                     cmd == SIOCBONDINFOQUERY ||
5465                     cmd == SIOCBONDCHANGEACTIVE ||
5466                     cmd == SIOCGMIIPHY ||
5467                     cmd == SIOCGMIIREG ||
5468                     cmd == SIOCSMIIREG ||
5469                     cmd == SIOCBRADDIF ||
5470                     cmd == SIOCBRDELIF ||
5471                     cmd == SIOCSHWTSTAMP ||
5472                     cmd == SIOCWANDEV) {
5473                         err = -EOPNOTSUPP;
5474                         if (ops->ndo_do_ioctl) {
5475                                 if (netif_device_present(dev))
5476                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5477                                 else
5478                                         err = -ENODEV;
5479                         }
5480                 } else
5481                         err = -EINVAL;
5482
5483         }
5484         return err;
5485 }
5486
5487 /*
5488  *      This function handles all "interface"-type I/O control requests. The actual
5489  *      'doing' part of this is dev_ifsioc above.
5490  */
5491
5492 /**
5493  *      dev_ioctl       -       network device ioctl
5494  *      @net: the applicable net namespace
5495  *      @cmd: command to issue
5496  *      @arg: pointer to a struct ifreq in user space
5497  *
5498  *      Issue ioctl functions to devices. This is normally called by the
5499  *      user space syscall interfaces but can sometimes be useful for
5500  *      other purposes. The return value is the return from the syscall if
5501  *      positive or a negative errno code on error.
5502  */
5503
5504 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5505 {
5506         struct ifreq ifr;
5507         int ret;
5508         char *colon;
5509
5510         /* One special case: SIOCGIFCONF takes ifconf argument
5511            and requires shared lock, because it sleeps writing
5512            to user space.
5513          */
5514
5515         if (cmd == SIOCGIFCONF) {
5516                 rtnl_lock();
5517                 ret = dev_ifconf(net, (char __user *) arg);
5518                 rtnl_unlock();
5519                 return ret;
5520         }
5521         if (cmd == SIOCGIFNAME)
5522                 return dev_ifname(net, (struct ifreq __user *)arg);
5523
5524         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5525                 return -EFAULT;
5526
5527         ifr.ifr_name[IFNAMSIZ-1] = 0;
5528
5529         colon = strchr(ifr.ifr_name, ':');
5530         if (colon)
5531                 *colon = 0;
5532
5533         /*
5534          *      See which interface the caller is talking about.
5535          */
5536
5537         switch (cmd) {
5538         /*
5539          *      These ioctl calls:
5540          *      - can be done by all.
5541          *      - atomic and do not require locking.
5542          *      - return a value
5543          */
5544         case SIOCGIFFLAGS:
5545         case SIOCGIFMETRIC:
5546         case SIOCGIFMTU:
5547         case SIOCGIFHWADDR:
5548         case SIOCGIFSLAVE:
5549         case SIOCGIFMAP:
5550         case SIOCGIFINDEX:
5551         case SIOCGIFTXQLEN:
5552                 dev_load(net, ifr.ifr_name);
5553                 rcu_read_lock();
5554                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5555                 rcu_read_unlock();
5556                 if (!ret) {
5557                         if (colon)
5558                                 *colon = ':';
5559                         if (copy_to_user(arg, &ifr,
5560                                          sizeof(struct ifreq)))
5561                                 ret = -EFAULT;
5562                 }
5563                 return ret;
5564
5565         case SIOCETHTOOL:
5566                 dev_load(net, ifr.ifr_name);
5567                 rtnl_lock();
5568                 ret = dev_ethtool(net, &ifr);
5569                 rtnl_unlock();
5570                 if (!ret) {
5571                         if (colon)
5572                                 *colon = ':';
5573                         if (copy_to_user(arg, &ifr,
5574                                          sizeof(struct ifreq)))
5575                                 ret = -EFAULT;
5576                 }
5577                 return ret;
5578
5579         /*
5580          *      These ioctl calls:
5581          *      - require superuser power.
5582          *      - require strict serialization.
5583          *      - return a value
5584          */
5585         case SIOCGMIIPHY:
5586         case SIOCGMIIREG:
5587         case SIOCSIFNAME:
5588                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5589                         return -EPERM;
5590                 dev_load(net, ifr.ifr_name);
5591                 rtnl_lock();
5592                 ret = dev_ifsioc(net, &ifr, cmd);
5593                 rtnl_unlock();
5594                 if (!ret) {
5595                         if (colon)
5596                                 *colon = ':';
5597                         if (copy_to_user(arg, &ifr,
5598                                          sizeof(struct ifreq)))
5599                                 ret = -EFAULT;
5600                 }
5601                 return ret;
5602
5603         /*
5604          *      These ioctl calls:
5605          *      - require superuser power.
5606          *      - require strict serialization.
5607          *      - do not return a value
5608          */
5609         case SIOCSIFMAP:
5610         case SIOCSIFTXQLEN:
5611                 if (!capable(CAP_NET_ADMIN))
5612                         return -EPERM;
5613                 /* fall through */
5614         /*
5615          *      These ioctl calls:
5616          *      - require local superuser power.
5617          *      - require strict serialization.
5618          *      - do not return a value
5619          */
5620         case SIOCSIFFLAGS:
5621         case SIOCSIFMETRIC:
5622         case SIOCSIFMTU:
5623         case SIOCSIFHWADDR:
5624         case SIOCSIFSLAVE:
5625         case SIOCADDMULTI:
5626         case SIOCDELMULTI:
5627         case SIOCSIFHWBROADCAST:
5628         case SIOCSMIIREG:
5629         case SIOCBONDENSLAVE:
5630         case SIOCBONDRELEASE:
5631         case SIOCBONDSETHWADDR:
5632         case SIOCBONDCHANGEACTIVE:
5633         case SIOCBRADDIF:
5634         case SIOCBRDELIF:
5635         case SIOCSHWTSTAMP:
5636                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5637                         return -EPERM;
5638                 /* fall through */
5639         case SIOCBONDSLAVEINFOQUERY:
5640         case SIOCBONDINFOQUERY:
5641                 dev_load(net, ifr.ifr_name);
5642                 rtnl_lock();
5643                 ret = dev_ifsioc(net, &ifr, cmd);
5644                 rtnl_unlock();
5645                 return ret;
5646
5647         case SIOCGIFMEM:
5648                 /* Get the per device memory space. We can add this but
5649                  * currently do not support it */
5650         case SIOCSIFMEM:
5651                 /* Set the per device memory buffer space.
5652                  * Not applicable in our case */
5653         case SIOCSIFLINK:
5654                 return -ENOTTY;
5655
5656         /*
5657          *      Unknown or private ioctl.
5658          */
5659         default:
5660                 if (cmd == SIOCWANDEV ||
5661                     (cmd >= SIOCDEVPRIVATE &&
5662                      cmd <= SIOCDEVPRIVATE + 15)) {
5663                         dev_load(net, ifr.ifr_name);
5664                         rtnl_lock();
5665                         ret = dev_ifsioc(net, &ifr, cmd);
5666                         rtnl_unlock();
5667                         if (!ret && copy_to_user(arg, &ifr,
5668                                                  sizeof(struct ifreq)))
5669                                 ret = -EFAULT;
5670                         return ret;
5671                 }
5672                 /* Take care of Wireless Extensions */
5673                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5674                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5675                 return -ENOTTY;
5676         }
5677 }
5678
5679
5680 /**
5681  *      dev_new_index   -       allocate an ifindex
5682  *      @net: the applicable net namespace
5683  *
5684  *      Returns a suitable unique value for a new device interface
5685  *      number.  The caller must hold the rtnl semaphore or the
5686  *      dev_base_lock to be sure it remains unique.
5687  */
5688 static int dev_new_index(struct net *net)
5689 {
5690         int ifindex = net->ifindex;
5691         for (;;) {
5692                 if (++ifindex <= 0)
5693                         ifindex = 1;
5694                 if (!__dev_get_by_index(net, ifindex))
5695                         return net->ifindex = ifindex;
5696         }
5697 }
5698
5699 /* Delayed registration/unregisteration */
5700 static LIST_HEAD(net_todo_list);
5701
5702 static void net_set_todo(struct net_device *dev)
5703 {
5704         list_add_tail(&dev->todo_list, &net_todo_list);
5705 }
5706
5707 static void rollback_registered_many(struct list_head *head)
5708 {
5709         struct net_device *dev, *tmp;
5710
5711         BUG_ON(dev_boot_phase);
5712         ASSERT_RTNL();
5713
5714         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5715                 /* Some devices call without registering
5716                  * for initialization unwind. Remove those
5717                  * devices and proceed with the remaining.
5718                  */
5719                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5720                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5721                                  dev->name, dev);
5722
5723                         WARN_ON(1);
5724                         list_del(&dev->unreg_list);
5725                         continue;
5726                 }
5727                 dev->dismantle = true;
5728                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5729         }
5730
5731         /* If device is running, close it first. */
5732         dev_close_many(head);
5733
5734         list_for_each_entry(dev, head, unreg_list) {
5735                 /* And unlink it from device chain. */
5736                 unlist_netdevice(dev);
5737
5738                 dev->reg_state = NETREG_UNREGISTERING;
5739         }
5740
5741         synchronize_net();
5742
5743         list_for_each_entry(dev, head, unreg_list) {
5744                 /* Shutdown queueing discipline. */
5745                 dev_shutdown(dev);
5746
5747
5748                 /* Notify protocols, that we are about to destroy
5749                    this device. They should clean all the things.
5750                 */
5751                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5752
5753                 if (!dev->rtnl_link_ops ||
5754                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5755                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5756
5757                 /*
5758                  *      Flush the unicast and multicast chains
5759                  */
5760                 dev_uc_flush(dev);
5761                 dev_mc_flush(dev);
5762
5763                 if (dev->netdev_ops->ndo_uninit)
5764                         dev->netdev_ops->ndo_uninit(dev);
5765
5766                 /* Notifier chain MUST detach us all upper devices. */
5767                 WARN_ON(netdev_has_any_upper_dev(dev));
5768
5769                 /* Remove entries from kobject tree */
5770                 netdev_unregister_kobject(dev);
5771 #ifdef CONFIG_XPS
5772                 /* Remove XPS queueing entries */
5773                 netif_reset_xps_queues_gt(dev, 0);
5774 #endif
5775         }
5776
5777         synchronize_net();
5778
5779         list_for_each_entry(dev, head, unreg_list)
5780                 dev_put(dev);
5781 }
5782
5783 static void rollback_registered(struct net_device *dev)
5784 {
5785         LIST_HEAD(single);
5786
5787         list_add(&dev->unreg_list, &single);
5788         rollback_registered_many(&single);
5789         list_del(&single);
5790 }
5791
5792 static netdev_features_t netdev_fix_features(struct net_device *dev,
5793         netdev_features_t features)
5794 {
5795         /* Fix illegal checksum combinations */
5796         if ((features & NETIF_F_HW_CSUM) &&
5797             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5798                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5799                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5800         }
5801
5802         /* Fix illegal SG+CSUM combinations. */
5803         if ((features & NETIF_F_SG) &&
5804             !(features & NETIF_F_ALL_CSUM)) {
5805                 netdev_dbg(dev,
5806                         "Dropping NETIF_F_SG since no checksum feature.\n");
5807                 features &= ~NETIF_F_SG;
5808         }
5809
5810         /* TSO requires that SG is present as well. */
5811         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5812                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5813                 features &= ~NETIF_F_ALL_TSO;
5814         }
5815
5816         /* TSO ECN requires that TSO is present as well. */
5817         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5818                 features &= ~NETIF_F_TSO_ECN;
5819
5820         /* Software GSO depends on SG. */
5821         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5822                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5823                 features &= ~NETIF_F_GSO;
5824         }
5825
5826         /* UFO needs SG and checksumming */
5827         if (features & NETIF_F_UFO) {
5828                 /* maybe split UFO into V4 and V6? */
5829                 if (!((features & NETIF_F_GEN_CSUM) ||
5830                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5831                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5832                         netdev_dbg(dev,
5833                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5834                         features &= ~NETIF_F_UFO;
5835                 }
5836
5837                 if (!(features & NETIF_F_SG)) {
5838                         netdev_dbg(dev,
5839                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5840                         features &= ~NETIF_F_UFO;
5841                 }
5842         }
5843
5844         return features;
5845 }
5846
5847 int __netdev_update_features(struct net_device *dev)
5848 {
5849         netdev_features_t features;
5850         int err = 0;
5851
5852         ASSERT_RTNL();
5853
5854         features = netdev_get_wanted_features(dev);
5855
5856         if (dev->netdev_ops->ndo_fix_features)
5857                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5858
5859         /* driver might be less strict about feature dependencies */
5860         features = netdev_fix_features(dev, features);
5861
5862         if (dev->features == features)
5863                 return 0;
5864
5865         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5866                 &dev->features, &features);
5867
5868         if (dev->netdev_ops->ndo_set_features)
5869                 err = dev->netdev_ops->ndo_set_features(dev, features);
5870
5871         if (unlikely(err < 0)) {
5872                 netdev_err(dev,
5873                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5874                         err, &features, &dev->features);
5875                 return -1;
5876         }
5877
5878         if (!err)
5879                 dev->features = features;
5880
5881         return 1;
5882 }
5883
5884 /**
5885  *      netdev_update_features - recalculate device features
5886  *      @dev: the device to check
5887  *
5888  *      Recalculate dev->features set and send notifications if it
5889  *      has changed. Should be called after driver or hardware dependent
5890  *      conditions might have changed that influence the features.
5891  */
5892 void netdev_update_features(struct net_device *dev)
5893 {
5894         if (__netdev_update_features(dev))
5895                 netdev_features_change(dev);
5896 }
5897 EXPORT_SYMBOL(netdev_update_features);
5898
5899 /**
5900  *      netdev_change_features - recalculate device features
5901  *      @dev: the device to check
5902  *
5903  *      Recalculate dev->features set and send notifications even
5904  *      if they have not changed. Should be called instead of
5905  *      netdev_update_features() if also dev->vlan_features might
5906  *      have changed to allow the changes to be propagated to stacked
5907  *      VLAN devices.
5908  */
5909 void netdev_change_features(struct net_device *dev)
5910 {
5911         __netdev_update_features(dev);
5912         netdev_features_change(dev);
5913 }
5914 EXPORT_SYMBOL(netdev_change_features);
5915
5916 /**
5917  *      netif_stacked_transfer_operstate -      transfer operstate
5918  *      @rootdev: the root or lower level device to transfer state from
5919  *      @dev: the device to transfer operstate to
5920  *
5921  *      Transfer operational state from root to device. This is normally
5922  *      called when a stacking relationship exists between the root
5923  *      device and the device(a leaf device).
5924  */
5925 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5926                                         struct net_device *dev)
5927 {
5928         if (rootdev->operstate == IF_OPER_DORMANT)
5929                 netif_dormant_on(dev);
5930         else
5931                 netif_dormant_off(dev);
5932
5933         if (netif_carrier_ok(rootdev)) {
5934                 if (!netif_carrier_ok(dev))
5935                         netif_carrier_on(dev);
5936         } else {
5937                 if (netif_carrier_ok(dev))
5938                         netif_carrier_off(dev);
5939         }
5940 }
5941 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5942
5943 #ifdef CONFIG_RPS
5944 static int netif_alloc_rx_queues(struct net_device *dev)
5945 {
5946         unsigned int i, count = dev->num_rx_queues;
5947         struct netdev_rx_queue *rx;
5948
5949         BUG_ON(count < 1);
5950
5951         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5952         if (!rx) {
5953                 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5954                 return -ENOMEM;
5955         }
5956         dev->_rx = rx;
5957
5958         for (i = 0; i < count; i++)
5959                 rx[i].dev = dev;
5960         return 0;
5961 }
5962 #endif
5963
5964 static void netdev_init_one_queue(struct net_device *dev,
5965                                   struct netdev_queue *queue, void *_unused)
5966 {
5967         /* Initialize queue lock */
5968         spin_lock_init(&queue->_xmit_lock);
5969         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5970         queue->xmit_lock_owner = -1;
5971         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5972         queue->dev = dev;
5973 #ifdef CONFIG_BQL
5974         dql_init(&queue->dql, HZ);
5975 #endif
5976 }
5977
5978 static int netif_alloc_netdev_queues(struct net_device *dev)
5979 {
5980         unsigned int count = dev->num_tx_queues;
5981         struct netdev_queue *tx;
5982
5983         BUG_ON(count < 1);
5984
5985         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5986         if (!tx) {
5987                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5988                 return -ENOMEM;
5989         }
5990         dev->_tx = tx;
5991
5992         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5993         spin_lock_init(&dev->tx_global_lock);
5994
5995         return 0;
5996 }
5997
5998 /**
5999  *      register_netdevice      - register a network device
6000  *      @dev: device to register
6001  *
6002  *      Take a completed network device structure and add it to the kernel
6003  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6004  *      chain. 0 is returned on success. A negative errno code is returned
6005  *      on a failure to set up the device, or if the name is a duplicate.
6006  *
6007  *      Callers must hold the rtnl semaphore. You may want
6008  *      register_netdev() instead of this.
6009  *
6010  *      BUGS:
6011  *      The locking appears insufficient to guarantee two parallel registers
6012  *      will not get the same name.
6013  */
6014
6015 int register_netdevice(struct net_device *dev)
6016 {
6017         int ret;
6018         struct net *net = dev_net(dev);
6019
6020         BUG_ON(dev_boot_phase);
6021         ASSERT_RTNL();
6022
6023         might_sleep();
6024
6025         /* When net_device's are persistent, this will be fatal. */
6026         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6027         BUG_ON(!net);
6028
6029         spin_lock_init(&dev->addr_list_lock);
6030         netdev_set_addr_lockdep_class(dev);
6031
6032         dev->iflink = -1;
6033
6034         ret = dev_get_valid_name(net, dev, dev->name);
6035         if (ret < 0)
6036                 goto out;
6037
6038         /* Init, if this function is available */
6039         if (dev->netdev_ops->ndo_init) {
6040                 ret = dev->netdev_ops->ndo_init(dev);
6041                 if (ret) {
6042                         if (ret > 0)
6043                                 ret = -EIO;
6044                         goto out;
6045                 }
6046         }
6047
6048         ret = -EBUSY;
6049         if (!dev->ifindex)
6050                 dev->ifindex = dev_new_index(net);
6051         else if (__dev_get_by_index(net, dev->ifindex))
6052                 goto err_uninit;
6053
6054         if (dev->iflink == -1)
6055                 dev->iflink = dev->ifindex;
6056
6057         /* Transfer changeable features to wanted_features and enable
6058          * software offloads (GSO and GRO).
6059          */
6060         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6061         dev->features |= NETIF_F_SOFT_FEATURES;
6062         dev->wanted_features = dev->features & dev->hw_features;
6063
6064         /* Turn on no cache copy if HW is doing checksum */
6065         if (!(dev->flags & IFF_LOOPBACK)) {
6066                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6067                 if (dev->features & NETIF_F_ALL_CSUM) {
6068                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
6069                         dev->features |= NETIF_F_NOCACHE_COPY;
6070                 }
6071         }
6072
6073         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6074          */
6075         dev->vlan_features |= NETIF_F_HIGHDMA;
6076
6077         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6078         ret = notifier_to_errno(ret);
6079         if (ret)
6080                 goto err_uninit;
6081
6082         ret = netdev_register_kobject(dev);
6083         if (ret)
6084                 goto err_uninit;
6085         dev->reg_state = NETREG_REGISTERED;
6086
6087         __netdev_update_features(dev);
6088
6089         /*
6090          *      Default initial state at registry is that the
6091          *      device is present.
6092          */
6093
6094         set_bit(__LINK_STATE_PRESENT, &dev->state);
6095
6096         linkwatch_init_dev(dev);
6097
6098         dev_init_scheduler(dev);
6099         dev_hold(dev);
6100         list_netdevice(dev);
6101         add_device_randomness(dev->dev_addr, dev->addr_len);
6102
6103         /* If the device has permanent device address, driver should
6104          * set dev_addr and also addr_assign_type should be set to
6105          * NET_ADDR_PERM (default value).
6106          */
6107         if (dev->addr_assign_type == NET_ADDR_PERM)
6108                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6109
6110         /* Notify protocols, that a new device appeared. */
6111         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6112         ret = notifier_to_errno(ret);
6113         if (ret) {
6114                 rollback_registered(dev);
6115                 dev->reg_state = NETREG_UNREGISTERED;
6116         }
6117         /*
6118          *      Prevent userspace races by waiting until the network
6119          *      device is fully setup before sending notifications.
6120          */
6121         if (!dev->rtnl_link_ops ||
6122             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6123                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6124
6125 out:
6126         return ret;
6127
6128 err_uninit:
6129         if (dev->netdev_ops->ndo_uninit)
6130                 dev->netdev_ops->ndo_uninit(dev);
6131         goto out;
6132 }
6133 EXPORT_SYMBOL(register_netdevice);
6134
6135 /**
6136  *      init_dummy_netdev       - init a dummy network device for NAPI
6137  *      @dev: device to init
6138  *
6139  *      This takes a network device structure and initialize the minimum
6140  *      amount of fields so it can be used to schedule NAPI polls without
6141  *      registering a full blown interface. This is to be used by drivers
6142  *      that need to tie several hardware interfaces to a single NAPI
6143  *      poll scheduler due to HW limitations.
6144  */
6145 int init_dummy_netdev(struct net_device *dev)
6146 {
6147         /* Clear everything. Note we don't initialize spinlocks
6148          * are they aren't supposed to be taken by any of the
6149          * NAPI code and this dummy netdev is supposed to be
6150          * only ever used for NAPI polls
6151          */
6152         memset(dev, 0, sizeof(struct net_device));
6153
6154         /* make sure we BUG if trying to hit standard
6155          * register/unregister code path
6156          */
6157         dev->reg_state = NETREG_DUMMY;
6158
6159         /* NAPI wants this */
6160         INIT_LIST_HEAD(&dev->napi_list);
6161
6162         /* a dummy interface is started by default */
6163         set_bit(__LINK_STATE_PRESENT, &dev->state);
6164         set_bit(__LINK_STATE_START, &dev->state);
6165
6166         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6167          * because users of this 'device' dont need to change
6168          * its refcount.
6169          */
6170
6171         return 0;
6172 }
6173 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6174
6175
6176 /**
6177  *      register_netdev - register a network device
6178  *      @dev: device to register
6179  *
6180  *      Take a completed network device structure and add it to the kernel
6181  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6182  *      chain. 0 is returned on success. A negative errno code is returned
6183  *      on a failure to set up the device, or if the name is a duplicate.
6184  *
6185  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6186  *      and expands the device name if you passed a format string to
6187  *      alloc_netdev.
6188  */
6189 int register_netdev(struct net_device *dev)
6190 {
6191         int err;
6192
6193         rtnl_lock();
6194         err = register_netdevice(dev);
6195         rtnl_unlock();
6196         return err;
6197 }
6198 EXPORT_SYMBOL(register_netdev);
6199
6200 int netdev_refcnt_read(const struct net_device *dev)
6201 {
6202         int i, refcnt = 0;
6203
6204         for_each_possible_cpu(i)
6205                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6206         return refcnt;
6207 }
6208 EXPORT_SYMBOL(netdev_refcnt_read);
6209
6210 /**
6211  * netdev_wait_allrefs - wait until all references are gone.
6212  * @dev: target net_device
6213  *
6214  * This is called when unregistering network devices.
6215  *
6216  * Any protocol or device that holds a reference should register
6217  * for netdevice notification, and cleanup and put back the
6218  * reference if they receive an UNREGISTER event.
6219  * We can get stuck here if buggy protocols don't correctly
6220  * call dev_put.
6221  */
6222 static void netdev_wait_allrefs(struct net_device *dev)
6223 {
6224         unsigned long rebroadcast_time, warning_time;
6225         int refcnt;
6226
6227         linkwatch_forget_dev(dev);
6228
6229         rebroadcast_time = warning_time = jiffies;
6230         refcnt = netdev_refcnt_read(dev);
6231
6232         while (refcnt != 0) {
6233                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6234                         rtnl_lock();
6235
6236                         /* Rebroadcast unregister notification */
6237                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6238
6239                         __rtnl_unlock();
6240                         rcu_barrier();
6241                         rtnl_lock();
6242
6243                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6244                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6245                                      &dev->state)) {
6246                                 /* We must not have linkwatch events
6247                                  * pending on unregister. If this
6248                                  * happens, we simply run the queue
6249                                  * unscheduled, resulting in a noop
6250                                  * for this device.
6251                                  */
6252                                 linkwatch_run_queue();
6253                         }
6254
6255                         __rtnl_unlock();
6256
6257                         rebroadcast_time = jiffies;
6258                 }
6259
6260                 msleep(250);
6261
6262                 refcnt = netdev_refcnt_read(dev);
6263
6264                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6265                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6266                                  dev->name, refcnt);
6267                         warning_time = jiffies;
6268                 }
6269         }
6270 }
6271
6272 /* The sequence is:
6273  *
6274  *      rtnl_lock();
6275  *      ...
6276  *      register_netdevice(x1);
6277  *      register_netdevice(x2);
6278  *      ...
6279  *      unregister_netdevice(y1);
6280  *      unregister_netdevice(y2);
6281  *      ...
6282  *      rtnl_unlock();
6283  *      free_netdev(y1);
6284  *      free_netdev(y2);
6285  *
6286  * We are invoked by rtnl_unlock().
6287  * This allows us to deal with problems:
6288  * 1) We can delete sysfs objects which invoke hotplug
6289  *    without deadlocking with linkwatch via keventd.
6290  * 2) Since we run with the RTNL semaphore not held, we can sleep
6291  *    safely in order to wait for the netdev refcnt to drop to zero.
6292  *
6293  * We must not return until all unregister events added during
6294  * the interval the lock was held have been completed.
6295  */
6296 void netdev_run_todo(void)
6297 {
6298         struct list_head list;
6299
6300         /* Snapshot list, allow later requests */
6301         list_replace_init(&net_todo_list, &list);
6302
6303         __rtnl_unlock();
6304
6305
6306         /* Wait for rcu callbacks to finish before next phase */
6307         if (!list_empty(&list))
6308                 rcu_barrier();
6309
6310         while (!list_empty(&list)) {
6311                 struct net_device *dev
6312                         = list_first_entry(&list, struct net_device, todo_list);
6313                 list_del(&dev->todo_list);
6314
6315                 rtnl_lock();
6316                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6317                 __rtnl_unlock();
6318
6319                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6320                         pr_err("network todo '%s' but state %d\n",
6321                                dev->name, dev->reg_state);
6322                         dump_stack();
6323                         continue;
6324                 }
6325
6326                 dev->reg_state = NETREG_UNREGISTERED;
6327
6328                 on_each_cpu(flush_backlog, dev, 1);
6329
6330                 netdev_wait_allrefs(dev);
6331
6332                 /* paranoia */
6333                 BUG_ON(netdev_refcnt_read(dev));
6334                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6335                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6336                 WARN_ON(dev->dn_ptr);
6337
6338                 if (dev->destructor)
6339                         dev->destructor(dev);
6340
6341                 /* Free network device */
6342                 kobject_put(&dev->dev.kobj);
6343         }
6344 }
6345
6346 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6347  * fields in the same order, with only the type differing.
6348  */
6349 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6350                              const struct net_device_stats *netdev_stats)
6351 {
6352 #if BITS_PER_LONG == 64
6353         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6354         memcpy(stats64, netdev_stats, sizeof(*stats64));
6355 #else
6356         size_t i, n = sizeof(*stats64) / sizeof(u64);
6357         const unsigned long *src = (const unsigned long *)netdev_stats;
6358         u64 *dst = (u64 *)stats64;
6359
6360         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6361                      sizeof(*stats64) / sizeof(u64));
6362         for (i = 0; i < n; i++)
6363                 dst[i] = src[i];
6364 #endif
6365 }
6366 EXPORT_SYMBOL(netdev_stats_to_stats64);
6367
6368 /**
6369  *      dev_get_stats   - get network device statistics
6370  *      @dev: device to get statistics from
6371  *      @storage: place to store stats
6372  *
6373  *      Get network statistics from device. Return @storage.
6374  *      The device driver may provide its own method by setting
6375  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6376  *      otherwise the internal statistics structure is used.
6377  */
6378 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6379                                         struct rtnl_link_stats64 *storage)
6380 {
6381         const struct net_device_ops *ops = dev->netdev_ops;
6382
6383         if (ops->ndo_get_stats64) {
6384                 memset(storage, 0, sizeof(*storage));
6385                 ops->ndo_get_stats64(dev, storage);
6386         } else if (ops->ndo_get_stats) {
6387                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6388         } else {
6389                 netdev_stats_to_stats64(storage, &dev->stats);
6390         }
6391         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6392         return storage;
6393 }
6394 EXPORT_SYMBOL(dev_get_stats);
6395
6396 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6397 {
6398         struct netdev_queue *queue = dev_ingress_queue(dev);
6399
6400 #ifdef CONFIG_NET_CLS_ACT
6401         if (queue)
6402                 return queue;
6403         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6404         if (!queue)
6405                 return NULL;
6406         netdev_init_one_queue(dev, queue, NULL);
6407         queue->qdisc = &noop_qdisc;
6408         queue->qdisc_sleeping = &noop_qdisc;
6409         rcu_assign_pointer(dev->ingress_queue, queue);
6410 #endif
6411         return queue;
6412 }
6413
6414 static const struct ethtool_ops default_ethtool_ops;
6415
6416 void netdev_set_default_ethtool_ops(struct net_device *dev,
6417                                     const struct ethtool_ops *ops)
6418 {
6419         if (dev->ethtool_ops == &default_ethtool_ops)
6420                 dev->ethtool_ops = ops;
6421 }
6422 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6423
6424 /**
6425  *      alloc_netdev_mqs - allocate network device
6426  *      @sizeof_priv:   size of private data to allocate space for
6427  *      @name:          device name format string
6428  *      @setup:         callback to initialize device
6429  *      @txqs:          the number of TX subqueues to allocate
6430  *      @rxqs:          the number of RX subqueues to allocate
6431  *
6432  *      Allocates a struct net_device with private data area for driver use
6433  *      and performs basic initialization.  Also allocates subquue structs
6434  *      for each queue on the device.
6435  */
6436 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6437                 void (*setup)(struct net_device *),
6438                 unsigned int txqs, unsigned int rxqs)
6439 {
6440         struct net_device *dev;
6441         size_t alloc_size;
6442         struct net_device *p;
6443
6444         BUG_ON(strlen(name) >= sizeof(dev->name));
6445
6446         if (txqs < 1) {
6447                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6448                 return NULL;
6449         }
6450
6451 #ifdef CONFIG_RPS
6452         if (rxqs < 1) {
6453                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6454                 return NULL;
6455         }
6456 #endif
6457
6458         alloc_size = sizeof(struct net_device);
6459         if (sizeof_priv) {
6460                 /* ensure 32-byte alignment of private area */
6461                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6462                 alloc_size += sizeof_priv;
6463         }
6464         /* ensure 32-byte alignment of whole construct */
6465         alloc_size += NETDEV_ALIGN - 1;
6466
6467         p = kzalloc(alloc_size, GFP_KERNEL);
6468         if (!p) {
6469                 pr_err("alloc_netdev: Unable to allocate device\n");
6470                 return NULL;
6471         }
6472
6473         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6474         dev->padded = (char *)dev - (char *)p;
6475
6476         dev->pcpu_refcnt = alloc_percpu(int);
6477         if (!dev->pcpu_refcnt)
6478                 goto free_p;
6479
6480         if (dev_addr_init(dev))
6481                 goto free_pcpu;
6482
6483         dev_mc_init(dev);
6484         dev_uc_init(dev);
6485
6486         dev_net_set(dev, &init_net);
6487
6488         dev->gso_max_size = GSO_MAX_SIZE;
6489         dev->gso_max_segs = GSO_MAX_SEGS;
6490
6491         INIT_LIST_HEAD(&dev->napi_list);
6492         INIT_LIST_HEAD(&dev->unreg_list);
6493         INIT_LIST_HEAD(&dev->link_watch_list);
6494         INIT_LIST_HEAD(&dev->upper_dev_list);
6495         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6496         setup(dev);
6497
6498         dev->num_tx_queues = txqs;
6499         dev->real_num_tx_queues = txqs;
6500         if (netif_alloc_netdev_queues(dev))
6501                 goto free_all;
6502
6503 #ifdef CONFIG_RPS
6504         dev->num_rx_queues = rxqs;
6505         dev->real_num_rx_queues = rxqs;
6506         if (netif_alloc_rx_queues(dev))
6507                 goto free_all;
6508 #endif
6509
6510         strcpy(dev->name, name);
6511         dev->group = INIT_NETDEV_GROUP;
6512         if (!dev->ethtool_ops)
6513                 dev->ethtool_ops = &default_ethtool_ops;
6514         return dev;
6515
6516 free_all:
6517         free_netdev(dev);
6518         return NULL;
6519
6520 free_pcpu:
6521         free_percpu(dev->pcpu_refcnt);
6522         kfree(dev->_tx);
6523 #ifdef CONFIG_RPS
6524         kfree(dev->_rx);
6525 #endif
6526
6527 free_p:
6528         kfree(p);
6529         return NULL;
6530 }
6531 EXPORT_SYMBOL(alloc_netdev_mqs);
6532
6533 /**
6534  *      free_netdev - free network device
6535  *      @dev: device
6536  *
6537  *      This function does the last stage of destroying an allocated device
6538  *      interface. The reference to the device object is released.
6539  *      If this is the last reference then it will be freed.
6540  */
6541 void free_netdev(struct net_device *dev)
6542 {
6543         struct napi_struct *p, *n;
6544
6545         release_net(dev_net(dev));
6546
6547         kfree(dev->_tx);
6548 #ifdef CONFIG_RPS
6549         kfree(dev->_rx);
6550 #endif
6551
6552         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6553
6554         /* Flush device addresses */
6555         dev_addr_flush(dev);
6556
6557         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6558                 netif_napi_del(p);
6559
6560         free_percpu(dev->pcpu_refcnt);
6561         dev->pcpu_refcnt = NULL;
6562
6563         /*  Compatibility with error handling in drivers */
6564         if (dev->reg_state == NETREG_UNINITIALIZED) {
6565                 kfree((char *)dev - dev->padded);
6566                 return;
6567         }
6568
6569         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6570         dev->reg_state = NETREG_RELEASED;
6571
6572         /* will free via device release */
6573         put_device(&dev->dev);
6574 }
6575 EXPORT_SYMBOL(free_netdev);
6576
6577 /**
6578  *      synchronize_net -  Synchronize with packet receive processing
6579  *
6580  *      Wait for packets currently being received to be done.
6581  *      Does not block later packets from starting.
6582  */
6583 void synchronize_net(void)
6584 {
6585         might_sleep();
6586         if (rtnl_is_locked())
6587                 synchronize_rcu_expedited();
6588         else
6589                 synchronize_rcu();
6590 }
6591 EXPORT_SYMBOL(synchronize_net);
6592
6593 /**
6594  *      unregister_netdevice_queue - remove device from the kernel
6595  *      @dev: device
6596  *      @head: list
6597  *
6598  *      This function shuts down a device interface and removes it
6599  *      from the kernel tables.
6600  *      If head not NULL, device is queued to be unregistered later.
6601  *
6602  *      Callers must hold the rtnl semaphore.  You may want
6603  *      unregister_netdev() instead of this.
6604  */
6605
6606 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6607 {
6608         ASSERT_RTNL();
6609
6610         if (head) {
6611                 list_move_tail(&dev->unreg_list, head);
6612         } else {
6613                 rollback_registered(dev);
6614                 /* Finish processing unregister after unlock */
6615                 net_set_todo(dev);
6616         }
6617 }
6618 EXPORT_SYMBOL(unregister_netdevice_queue);
6619
6620 /**
6621  *      unregister_netdevice_many - unregister many devices
6622  *      @head: list of devices
6623  */
6624 void unregister_netdevice_many(struct list_head *head)
6625 {
6626         struct net_device *dev;
6627
6628         if (!list_empty(head)) {
6629                 rollback_registered_many(head);
6630                 list_for_each_entry(dev, head, unreg_list)
6631                         net_set_todo(dev);
6632         }
6633 }
6634 EXPORT_SYMBOL(unregister_netdevice_many);
6635
6636 /**
6637  *      unregister_netdev - remove device from the kernel
6638  *      @dev: device
6639  *
6640  *      This function shuts down a device interface and removes it
6641  *      from the kernel tables.
6642  *
6643  *      This is just a wrapper for unregister_netdevice that takes
6644  *      the rtnl semaphore.  In general you want to use this and not
6645  *      unregister_netdevice.
6646  */
6647 void unregister_netdev(struct net_device *dev)
6648 {
6649         rtnl_lock();
6650         unregister_netdevice(dev);
6651         rtnl_unlock();
6652 }
6653 EXPORT_SYMBOL(unregister_netdev);
6654
6655 /**
6656  *      dev_change_net_namespace - move device to different nethost namespace
6657  *      @dev: device
6658  *      @net: network namespace
6659  *      @pat: If not NULL name pattern to try if the current device name
6660  *            is already taken in the destination network namespace.
6661  *
6662  *      This function shuts down a device interface and moves it
6663  *      to a new network namespace. On success 0 is returned, on
6664  *      a failure a netagive errno code is returned.
6665  *
6666  *      Callers must hold the rtnl semaphore.
6667  */
6668
6669 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6670 {
6671         int err;
6672
6673         ASSERT_RTNL();
6674
6675         /* Don't allow namespace local devices to be moved. */
6676         err = -EINVAL;
6677         if (dev->features & NETIF_F_NETNS_LOCAL)
6678                 goto out;
6679
6680         /* Ensure the device has been registrered */
6681         if (dev->reg_state != NETREG_REGISTERED)
6682                 goto out;
6683
6684         /* Get out if there is nothing todo */
6685         err = 0;
6686         if (net_eq(dev_net(dev), net))
6687                 goto out;
6688
6689         /* Pick the destination device name, and ensure
6690          * we can use it in the destination network namespace.
6691          */
6692         err = -EEXIST;
6693         if (__dev_get_by_name(net, dev->name)) {
6694                 /* We get here if we can't use the current device name */
6695                 if (!pat)
6696                         goto out;
6697                 if (dev_get_valid_name(net, dev, pat) < 0)
6698                         goto out;
6699         }
6700
6701         /*
6702          * And now a mini version of register_netdevice unregister_netdevice.
6703          */
6704
6705         /* If device is running close it first. */
6706         dev_close(dev);
6707
6708         /* And unlink it from device chain */
6709         err = -ENODEV;
6710         unlist_netdevice(dev);
6711
6712         synchronize_net();
6713
6714         /* Shutdown queueing discipline. */
6715         dev_shutdown(dev);
6716
6717         /* Notify protocols, that we are about to destroy
6718            this device. They should clean all the things.
6719
6720            Note that dev->reg_state stays at NETREG_REGISTERED.
6721            This is wanted because this way 8021q and macvlan know
6722            the device is just moving and can keep their slaves up.
6723         */
6724         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6725         rcu_barrier();
6726         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6727         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6728
6729         /*
6730          *      Flush the unicast and multicast chains
6731          */
6732         dev_uc_flush(dev);
6733         dev_mc_flush(dev);
6734
6735         /* Send a netdev-removed uevent to the old namespace */
6736         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6737
6738         /* Actually switch the network namespace */
6739         dev_net_set(dev, net);
6740
6741         /* If there is an ifindex conflict assign a new one */
6742         if (__dev_get_by_index(net, dev->ifindex)) {
6743                 int iflink = (dev->iflink == dev->ifindex);
6744                 dev->ifindex = dev_new_index(net);
6745                 if (iflink)
6746                         dev->iflink = dev->ifindex;
6747         }
6748
6749         /* Send a netdev-add uevent to the new namespace */
6750         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6751
6752         /* Fixup kobjects */
6753         err = device_rename(&dev->dev, dev->name);
6754         WARN_ON(err);
6755
6756         /* Add the device back in the hashes */
6757         list_netdevice(dev);
6758
6759         /* Notify protocols, that a new device appeared. */
6760         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6761
6762         /*
6763          *      Prevent userspace races by waiting until the network
6764          *      device is fully setup before sending notifications.
6765          */
6766         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6767
6768         synchronize_net();
6769         err = 0;
6770 out:
6771         return err;
6772 }
6773 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6774
6775 static int dev_cpu_callback(struct notifier_block *nfb,
6776                             unsigned long action,
6777                             void *ocpu)
6778 {
6779         struct sk_buff **list_skb;
6780         struct sk_buff *skb;
6781         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6782         struct softnet_data *sd, *oldsd;
6783
6784         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6785                 return NOTIFY_OK;
6786
6787         local_irq_disable();
6788         cpu = smp_processor_id();
6789         sd = &per_cpu(softnet_data, cpu);
6790         oldsd = &per_cpu(softnet_data, oldcpu);
6791
6792         /* Find end of our completion_queue. */
6793         list_skb = &sd->completion_queue;
6794         while (*list_skb)
6795                 list_skb = &(*list_skb)->next;
6796         /* Append completion queue from offline CPU. */
6797         *list_skb = oldsd->completion_queue;
6798         oldsd->completion_queue = NULL;
6799
6800         /* Append output queue from offline CPU. */
6801         if (oldsd->output_queue) {
6802                 *sd->output_queue_tailp = oldsd->output_queue;
6803                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6804                 oldsd->output_queue = NULL;
6805                 oldsd->output_queue_tailp = &oldsd->output_queue;
6806         }
6807         /* Append NAPI poll list from offline CPU. */
6808         if (!list_empty(&oldsd->poll_list)) {
6809                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6810                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6811         }
6812
6813         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6814         local_irq_enable();
6815
6816         /* Process offline CPU's input_pkt_queue */
6817         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6818                 netif_rx(skb);
6819                 input_queue_head_incr(oldsd);
6820         }
6821         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6822                 netif_rx(skb);
6823                 input_queue_head_incr(oldsd);
6824         }
6825
6826         return NOTIFY_OK;
6827 }
6828
6829
6830 /**
6831  *      netdev_increment_features - increment feature set by one
6832  *      @all: current feature set
6833  *      @one: new feature set
6834  *      @mask: mask feature set
6835  *
6836  *      Computes a new feature set after adding a device with feature set
6837  *      @one to the master device with current feature set @all.  Will not
6838  *      enable anything that is off in @mask. Returns the new feature set.
6839  */
6840 netdev_features_t netdev_increment_features(netdev_features_t all,
6841         netdev_features_t one, netdev_features_t mask)
6842 {
6843         if (mask & NETIF_F_GEN_CSUM)
6844                 mask |= NETIF_F_ALL_CSUM;
6845         mask |= NETIF_F_VLAN_CHALLENGED;
6846
6847         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6848         all &= one | ~NETIF_F_ALL_FOR_ALL;
6849
6850         /* If one device supports hw checksumming, set for all. */
6851         if (all & NETIF_F_GEN_CSUM)
6852                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6853
6854         return all;
6855 }
6856 EXPORT_SYMBOL(netdev_increment_features);
6857
6858 static struct hlist_head *netdev_create_hash(void)
6859 {
6860         int i;
6861         struct hlist_head *hash;
6862
6863         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6864         if (hash != NULL)
6865                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6866                         INIT_HLIST_HEAD(&hash[i]);
6867
6868         return hash;
6869 }
6870
6871 /* Initialize per network namespace state */
6872 static int __net_init netdev_init(struct net *net)
6873 {
6874         if (net != &init_net)
6875                 INIT_LIST_HEAD(&net->dev_base_head);
6876
6877         net->dev_name_head = netdev_create_hash();
6878         if (net->dev_name_head == NULL)
6879                 goto err_name;
6880
6881         net->dev_index_head = netdev_create_hash();
6882         if (net->dev_index_head == NULL)
6883                 goto err_idx;
6884
6885         return 0;
6886
6887 err_idx:
6888         kfree(net->dev_name_head);
6889 err_name:
6890         return -ENOMEM;
6891 }
6892
6893 /**
6894  *      netdev_drivername - network driver for the device
6895  *      @dev: network device
6896  *
6897  *      Determine network driver for device.
6898  */
6899 const char *netdev_drivername(const struct net_device *dev)
6900 {
6901         const struct device_driver *driver;
6902         const struct device *parent;
6903         const char *empty = "";
6904
6905         parent = dev->dev.parent;
6906         if (!parent)
6907                 return empty;
6908
6909         driver = parent->driver;
6910         if (driver && driver->name)
6911                 return driver->name;
6912         return empty;
6913 }
6914
6915 static int __netdev_printk(const char *level, const struct net_device *dev,
6916                            struct va_format *vaf)
6917 {
6918         int r;
6919
6920         if (dev && dev->dev.parent) {
6921                 r = dev_printk_emit(level[1] - '0',
6922                                     dev->dev.parent,
6923                                     "%s %s %s: %pV",
6924                                     dev_driver_string(dev->dev.parent),
6925                                     dev_name(dev->dev.parent),
6926                                     netdev_name(dev), vaf);
6927         } else if (dev) {
6928                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6929         } else {
6930                 r = printk("%s(NULL net_device): %pV", level, vaf);
6931         }
6932
6933         return r;
6934 }
6935
6936 int netdev_printk(const char *level, const struct net_device *dev,
6937                   const char *format, ...)
6938 {
6939         struct va_format vaf;
6940         va_list args;
6941         int r;
6942
6943         va_start(args, format);
6944
6945         vaf.fmt = format;
6946         vaf.va = &args;
6947
6948         r = __netdev_printk(level, dev, &vaf);
6949
6950         va_end(args);
6951
6952         return r;
6953 }
6954 EXPORT_SYMBOL(netdev_printk);
6955
6956 #define define_netdev_printk_level(func, level)                 \
6957 int func(const struct net_device *dev, const char *fmt, ...)    \
6958 {                                                               \
6959         int r;                                                  \
6960         struct va_format vaf;                                   \
6961         va_list args;                                           \
6962                                                                 \
6963         va_start(args, fmt);                                    \
6964                                                                 \
6965         vaf.fmt = fmt;                                          \
6966         vaf.va = &args;                                         \
6967                                                                 \
6968         r = __netdev_printk(level, dev, &vaf);                  \
6969                                                                 \
6970         va_end(args);                                           \
6971                                                                 \
6972         return r;                                               \
6973 }                                                               \
6974 EXPORT_SYMBOL(func);
6975
6976 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6977 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6978 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6979 define_netdev_printk_level(netdev_err, KERN_ERR);
6980 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6981 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6982 define_netdev_printk_level(netdev_info, KERN_INFO);
6983
6984 static void __net_exit netdev_exit(struct net *net)
6985 {
6986         kfree(net->dev_name_head);
6987         kfree(net->dev_index_head);
6988 }
6989
6990 static struct pernet_operations __net_initdata netdev_net_ops = {
6991         .init = netdev_init,
6992         .exit = netdev_exit,
6993 };
6994
6995 static void __net_exit default_device_exit(struct net *net)
6996 {
6997         struct net_device *dev, *aux;
6998         /*
6999          * Push all migratable network devices back to the
7000          * initial network namespace
7001          */
7002         rtnl_lock();
7003         for_each_netdev_safe(net, dev, aux) {
7004                 int err;
7005                 char fb_name[IFNAMSIZ];
7006
7007                 /* Ignore unmoveable devices (i.e. loopback) */
7008                 if (dev->features & NETIF_F_NETNS_LOCAL)
7009                         continue;
7010
7011                 /* Leave virtual devices for the generic cleanup */
7012                 if (dev->rtnl_link_ops)
7013                         continue;
7014
7015                 /* Push remaining network devices to init_net */
7016                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7017                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7018                 if (err) {
7019                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7020                                  __func__, dev->name, err);
7021                         BUG();
7022                 }
7023         }
7024         rtnl_unlock();
7025 }
7026
7027 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7028 {
7029         /* At exit all network devices most be removed from a network
7030          * namespace.  Do this in the reverse order of registration.
7031          * Do this across as many network namespaces as possible to
7032          * improve batching efficiency.
7033          */
7034         struct net_device *dev;
7035         struct net *net;
7036         LIST_HEAD(dev_kill_list);
7037
7038         rtnl_lock();
7039         list_for_each_entry(net, net_list, exit_list) {
7040                 for_each_netdev_reverse(net, dev) {
7041                         if (dev->rtnl_link_ops)
7042                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7043                         else
7044                                 unregister_netdevice_queue(dev, &dev_kill_list);
7045                 }
7046         }
7047         unregister_netdevice_many(&dev_kill_list);
7048         list_del(&dev_kill_list);
7049         rtnl_unlock();
7050 }
7051
7052 static struct pernet_operations __net_initdata default_device_ops = {
7053         .exit = default_device_exit,
7054         .exit_batch = default_device_exit_batch,
7055 };
7056
7057 /*
7058  *      Initialize the DEV module. At boot time this walks the device list and
7059  *      unhooks any devices that fail to initialise (normally hardware not
7060  *      present) and leaves us with a valid list of present and active devices.
7061  *
7062  */
7063
7064 /*
7065  *       This is called single threaded during boot, so no need
7066  *       to take the rtnl semaphore.
7067  */
7068 static int __init net_dev_init(void)
7069 {
7070         int i, rc = -ENOMEM;
7071
7072         BUG_ON(!dev_boot_phase);
7073
7074         if (dev_proc_init())
7075                 goto out;
7076
7077         if (netdev_kobject_init())
7078                 goto out;
7079
7080         INIT_LIST_HEAD(&ptype_all);
7081         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7082                 INIT_LIST_HEAD(&ptype_base[i]);
7083
7084         INIT_LIST_HEAD(&offload_base);
7085
7086         if (register_pernet_subsys(&netdev_net_ops))
7087                 goto out;
7088
7089         /*
7090          *      Initialise the packet receive queues.
7091          */
7092
7093         for_each_possible_cpu(i) {
7094                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7095
7096                 memset(sd, 0, sizeof(*sd));
7097                 skb_queue_head_init(&sd->input_pkt_queue);
7098                 skb_queue_head_init(&sd->process_queue);
7099                 sd->completion_queue = NULL;
7100                 INIT_LIST_HEAD(&sd->poll_list);
7101                 sd->output_queue = NULL;
7102                 sd->output_queue_tailp = &sd->output_queue;
7103 #ifdef CONFIG_RPS
7104                 sd->csd.func = rps_trigger_softirq;
7105                 sd->csd.info = sd;
7106                 sd->csd.flags = 0;
7107                 sd->cpu = i;
7108 #endif
7109
7110                 sd->backlog.poll = process_backlog;
7111                 sd->backlog.weight = weight_p;
7112                 sd->backlog.gro_list = NULL;
7113                 sd->backlog.gro_count = 0;
7114         }
7115
7116         dev_boot_phase = 0;
7117
7118         /* The loopback device is special if any other network devices
7119          * is present in a network namespace the loopback device must
7120          * be present. Since we now dynamically allocate and free the
7121          * loopback device ensure this invariant is maintained by
7122          * keeping the loopback device as the first device on the
7123          * list of network devices.  Ensuring the loopback devices
7124          * is the first device that appears and the last network device
7125          * that disappears.
7126          */
7127         if (register_pernet_device(&loopback_net_ops))
7128                 goto out;
7129
7130         if (register_pernet_device(&default_device_ops))
7131                 goto out;
7132
7133         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7134         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7135
7136         hotcpu_notifier(dev_cpu_callback, 0);
7137         dst_init();
7138         dev_mcast_init();
7139         rc = 0;
7140 out:
7141         return rc;
7142 }
7143
7144 subsys_initcall(net_dev_init);