net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/kallsyms.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130
 131 #include "net-sysfs.h"
 132
 133 /*
 134  *      The list of packet types we will receive (as opposed to discard)
 135  *      and the routines to invoke.
 136  *
 137  *      Why 16. Because with 16 the only overlap we get on a hash of the
 138  *      low nibble of the protocol value is RARP/SNAP/X.25.
 139  *
 140  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 141  *             sure which should go first, but I bet it won't make much
 142  *             difference if we are running VLANs.  The good news is that
 143  *             this protocol won't be in the list unless compiled in, so
 144  *             the average user (w/out VLANs) will not be adversely affected.
 145  *             --BLG
 146  *
 147  *              0800    IP
 148  *              8100    802.1Q VLAN
 149  *              0001    802.3
 150  *              0002    AX.25
 151  *              0004    802.2
 152  *              8035    RARP
 153  *              0005    SNAP
 154  *              0805    X.25
 155  *              0806    ARP
 156  *              8137    IPX
 157  *              0009    Localtalk
 158  *              86DD    IPv6
 159  */
 160
 161 #define PTYPE_HASH_SIZE (16)	/* number of buckets in ptype_base[] */
 162 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)	/* bucket index = ntohs(type) & PTYPE_HASH_MASK */
 163
 164 static DEFINE_SPINLOCK(ptype_lock);	/* taken with BHs off by dev_add_pack() and __dev_remove_pack() */
 165 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* handlers for one specific protocol type, hashed by type */
 166 static struct list_head ptype_all __read_mostly;        /* Taps: handlers registered for ETH_P_ALL */
 167
 168 #ifdef CONFIG_NET_DMA
 169 struct net_dma {	/* state of the networking DMA client; only built with CONFIG_NET_DMA */
 170         struct dma_client client;	/* its event_callback is set to netdev_dma_event in the initializer below */
 171         spinlock_t lock;	/* NOTE(review): what this lock covers is not visible in this part of the file */
 172         cpumask_t channel_mask;	/* NOTE(review): presumably the CPUs that have a channel - confirm */
 173         struct dma_chan **channels;	/* NOTE(review): presumably an array of channel pointers - confirm */
 174 };
 175
 176 static enum dma_state_client	/* forward declaration: referenced by the net_dma initializer below */
 177 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 178         enum dma_state state);
 179
 180 static struct net_dma net_dma = {	/* the single file-scope instance */
 181         .client = {
 182                 .event_callback = netdev_dma_event,
 183         },
 184 };
 185 #endif
 186
 187 /*
 188  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 189  * semaphore.
 190  *
 191  * Pure readers hold dev_base_lock for reading.
 192  *
 193  * Writers must hold the rtnl semaphore while they loop through the
 194  * dev_base_head list, and hold dev_base_lock for writing when they do the
 195  * actual updates.  This allows pure readers to access the list even
 196  * while a writer is preparing to update it.
 197  *
 198  * To put it another way, dev_base_lock is held for writing only to
 199  * protect against pure readers; the rtnl semaphore provides the
 200  * protection against other writers.
 201  *
 202  * See, for example usages, register_netdevice() and
 203  * unregister_netdevice(), which must be called with the rtnl
 204  * semaphore held.
 205  */
 206 DEFINE_RWLOCK(dev_base_lock);	/* write-held by list_netdevice()/unlist_netdevice(), read-held by the dev_get_by_*() lookups */
 207
 208 EXPORT_SYMBOL(dev_base_lock);
 209
 210 #define NETDEV_HASHBITS 8	/* log2 of the bucket range used by dev_name_hash() and dev_index_hash() */
 211 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)	/* the hash helpers produce indexes 0 .. NETDEV_HASHENTRIES - 1 */
 212
 213 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 214 {
 215         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 216         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 217 }
 218
 219 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 220 {
 221         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 222 }
 223
 224 /* Device list insertion */
 225 static int list_netdevice(struct net_device *dev)
 226 {
 227         struct net *net = dev_net(dev);
 228
 229         ASSERT_RTNL();
 230
 231         write_lock_bh(&dev_base_lock);
 232         list_add_tail(&dev->dev_list, &net->dev_base_head);
 233         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 234         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 235         write_unlock_bh(&dev_base_lock);
 236         return 0;
 237 }
 238
 239 /* Device list removal */
 240 static void unlist_netdevice(struct net_device *dev)
 241 {
 242         ASSERT_RTNL();
 243
 244         /* Unlink dev from the device chain */
 245         write_lock_bh(&dev_base_lock);
 246         list_del(&dev->dev_list);
 247         hlist_del(&dev->name_hlist);
 248         hlist_del(&dev->index_hlist);
 249         write_unlock_bh(&dev_base_lock);
 250 }
 251
 252 /*
 253  *      Our notifier list
 254  */
 255
 256 static RAW_NOTIFIER_HEAD(netdev_chain);	/* raw notifier head; NOTE(review): presumably serialized by the rtnl semaphore - confirm at the call sites */
 257
 258 /*
 259  *      Device drivers call our routines to queue packets here. We empty the
 260  *      queue in the local softnet handler.
 261  */
 262
 263 DEFINE_PER_CPU(struct softnet_data, softnet_data);	/* one struct softnet_data instance per CPU */
 264
 265 #ifdef CONFIG_LOCKDEP
 266 /*
 267  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 268  * according to dev->type
 269  */
 270 static const unsigned short netdev_lock_type[] =	/* device types with their own lock class; searched by netdev_lock_pos() */
 271         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 272          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 273          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 274          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 275          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 276          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 277          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 278          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 279          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 280          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 281          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 282          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 283          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 284          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
 285          ARPHRD_NONE};	/* last entry doubles as the fallback slot for unlisted types */
 286
 287 static const char *netdev_lock_name[] =	/* lockdep class names; used with the same index as netdev_lock_type[] */
 288         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 289          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 290          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 291          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 292          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 293          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 294          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 295          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 296          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 297          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 298          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 299          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 300          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 301          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
 302          "_xmit_NONE"};
 303
 304 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];	/* one key per type, for the xmit lock */
 305 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];	/* one key per type, for dev->addr_list_lock */
 306
 307 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 308 {
 309         int i;
 310
 311         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 312                 if (netdev_lock_type[i] == dev_type)
 313                         return i;
 314         /* the last key is used by default */
 315         return ARRAY_SIZE(netdev_lock_type) - 1;
 316 }
 317
 318 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 319                                                  unsigned short dev_type)
 320 {
 321         int i;
 322
 323         i = netdev_lock_pos(dev_type);
 324         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 325                                    netdev_lock_name[i]);
 326 }
 327
 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev->type);
 333         lockdep_set_class_and_name(&dev->addr_list_lock,
 334                                    &netdev_addr_lock_key[i],
 335                                    netdev_lock_name[i]);
 336 }
 337 #else
 338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339                                                  unsigned short dev_type)
 340 {	/* CONFIG_LOCKDEP is off: there is no lock class to assign */
 341 }
 342 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 343 {	/* CONFIG_LOCKDEP is off: there is no lock class to assign */
 344 }
 345 #endif
 346
 347 /*******************************************************************************
 348
 349                 Protocol management and registration routines
 350
 351 *******************************************************************************/
 352
 353 /*
 354  *      Add a protocol ID to the list. Now that the input handler is
 355  *      smarter we can dispense with all the messy stuff that used to be
 356  *      here.
 357  *
 358  *      BEWARE!!! Protocol handlers that mangle input packets
 359  *      MUST BE last in their hash buckets, and the check of protocol
 360  *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 361  *      This is true now; do not change it.
 362  *      Explanation: if a handler that mangles packets were first on
 363  *      the list, it could not tell that the packet is cloned and
 364  *      must be copied before writing, so it would modify the packet
 365  *      in place and subsequent readers would get a broken packet.
 366  *                                                      --ANK (980803)
 367  */
 368
 369 /**
 370  *      dev_add_pack - add packet handler
 371  *      @pt: packet type declaration
 372  *
 373  *      Add a protocol handler to the networking stack. The passed &packet_type
 374  *      is linked into kernel lists and may not be freed until it has been
 375  *      removed from the kernel lists.
 376  *
 377  *      This call does not sleep therefore it can not
 378  *      guarantee all CPU's that are in middle of receiving packets
 379  *      will see the new packet type (until the next received packet).
 380  */
 381
 382 void dev_add_pack(struct packet_type *pt)
 383 {
 384         int hash;
 385
 386         spin_lock_bh(&ptype_lock);
 387         if (pt->type == htons(ETH_P_ALL))
 388                 list_add_rcu(&pt->list, &ptype_all);
 389         else {
 390                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 391                 list_add_rcu(&pt->list, &ptype_base[hash]);
 392         }
 393         spin_unlock_bh(&ptype_lock);
 394 }
 395
 396 /**
 397  *      __dev_remove_pack        - remove packet handler
 398  *      @pt: packet type declaration
 399  *
 400  *      Remove a protocol handler that was previously added to the kernel
 401  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 402  *      from the kernel lists and can be freed or reused once this function
 403  *      returns.
 404  *
 405  *      The packet type might still be in use by receivers
 406  *      and must not be freed until after all the CPU's have gone
 407  *      through a quiescent state.
 408  */
 409 void __dev_remove_pack(struct packet_type *pt)
 410 {
 411         struct list_head *head;
 412         struct packet_type *pt1;
 413
 414         spin_lock_bh(&ptype_lock);
 415
 416         if (pt->type == htons(ETH_P_ALL))
 417                 head = &ptype_all;
 418         else
 419                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 420
 421         list_for_each_entry(pt1, head, list) {
 422                 if (pt == pt1) {
 423                         list_del_rcu(&pt->list);
 424                         goto out;
 425                 }
 426         }
 427
 428         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 429 out:
 430         spin_unlock_bh(&ptype_lock);
 431 }
 432 /**
 433  *      dev_remove_pack  - remove packet handler
 434  *      @pt: packet type declaration
 435  *
 436  *      Remove a protocol handler that was previously added to the kernel
 437  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 438  *      from the kernel lists and can be freed or reused once this function
 439  *      returns.
 440  *
 441  *      This call sleeps to guarantee that no CPU is looking at the packet
 442  *      type after return.
 443  */
 444 void dev_remove_pack(struct packet_type *pt)
 445 {
 446         __dev_remove_pack(pt);
 447
 448         synchronize_net();
 449 }
 450
 451 /******************************************************************************
 452
 453                       Device Boot-time Settings Routines
 454
 455 *******************************************************************************/
 456
 457 /* Boot time configuration table; a slot whose name is empty or starts with ' ' counts as free */
 458 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 459
 460 /**
 461  *      netdev_boot_setup_add   - add new setup entry
 462  *      @name: name of the device
 463  *      @map: configured settings for the device
 464  *
 465  *      Adds new setup entry to the dev_boot_setup list.  The function
 466  *      returns 0 on error and 1 on success.  This is a generic routine to
 467  *      all netdevices.
 468  */
 469 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 470 {
 471         struct netdev_boot_setup *s;
 472         int i;
 473
 474         s = dev_boot_setup;
 475         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 476                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 477                         memset(s[i].name, 0, sizeof(s[i].name));
 478                         strlcpy(s[i].name, name, IFNAMSIZ);
 479                         memcpy(&s[i].map, map, sizeof(s[i].map));
 480                         break;
 481                 }
 482         }
 483
 484         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 485 }
 486
 487 /**
 488  *      netdev_boot_setup_check - check boot time settings
 489  *      @dev: the netdevice
 490  *
 491  *      Check boot time settings for the device.
 492  *      The found settings are set for the device to be used
 493  *      later in the device probing.
 494  *      Returns 0 if no settings found, 1 if they are.
 495  */
 496 int netdev_boot_setup_check(struct net_device *dev)
 497 {
 498         struct netdev_boot_setup *s = dev_boot_setup;
 499         int i;
 500
 501         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 502                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 503                     !strcmp(dev->name, s[i].name)) {
 504                         dev->irq        = s[i].map.irq;
 505                         dev->base_addr  = s[i].map.base_addr;
 506                         dev->mem_start  = s[i].map.mem_start;
 507                         dev->mem_end    = s[i].map.mem_end;
 508                         return 1;
 509                 }
 510         }
 511         return 0;
 512 }
 513
 514
 515 /**
 516  *      netdev_boot_base        - get address from boot time settings
 517  *      @prefix: prefix for network device
 518  *      @unit: id for network device
 519  *
 520  *      Check boot time settings for the base address of device.
 521  *      The found settings are set for the device to be used
 522  *      later in the device probing.
 523  *      Returns 0 if no settings found.
 524  */
 525 unsigned long netdev_boot_base(const char *prefix, int unit)
 526 {
 527         const struct netdev_boot_setup *s = dev_boot_setup;
 528         char name[IFNAMSIZ];
 529         int i;
 530
 531         sprintf(name, "%s%d", prefix, unit);
 532
 533         /*
 534          * If device already registered then return base of 1
 535          * to indicate not to probe for this interface
 536          */
 537         if (__dev_get_by_name(&init_net, name))
 538                 return 1;
 539
 540         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 541                 if (!strcmp(name, s[i].name))
 542                         return s[i].map.base_addr;
 543         return 0;
 544 }
 545
 546 /*
 547  * Saves at boot time configured settings for any netdevice.
 548  */
 549 int __init netdev_boot_setup(char *str)
 550 {
 551         int ints[5];
 552         struct ifmap map;
 553
 554         str = get_options(str, ARRAY_SIZE(ints), ints);
 555         if (!str || !*str)
 556                 return 0;
 557
 558         /* Save settings */
 559         memset(&map, 0, sizeof(map));
 560         if (ints[0] > 0)
 561                 map.irq = ints[1];
 562         if (ints[0] > 1)
 563                 map.base_addr = ints[2];
 564         if (ints[0] > 2)
 565                 map.mem_start = ints[3];
 566         if (ints[0] > 3)
 567                 map.mem_end = ints[4];
 568
 569         /* Add new entry to the list */
 570         return netdev_boot_setup_add(str, &map);
 571 }
 572
 573 __setup("netdev=", netdev_boot_setup);
 574
 575 /*******************************************************************************
 576
 577                             Device Interface Subroutines
 578
 579 *******************************************************************************/
 580
 581 /**
 582  *      __dev_get_by_name       - find a device by its name
 583  *      @net: the applicable net namespace
 584  *      @name: name to find
 585  *
 586  *      Find an interface by name. Must be called under RTNL semaphore
 587  *      or @dev_base_lock. If the name is found a pointer to the device
 588  *      is returned. If the name is not found then %NULL is returned. The
 589  *      reference counters are not incremented so the caller must be
 590  *      careful with locks.
 591  */
 592
 593 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 594 {
 595         struct hlist_node *p;
 596
 597         hlist_for_each(p, dev_name_hash(net, name)) {
 598                 struct net_device *dev
 599                         = hlist_entry(p, struct net_device, name_hlist);
 600                 if (!strncmp(dev->name, name, IFNAMSIZ))
 601                         return dev;
 602         }
 603         return NULL;
 604 }
 605
 606 /**
 607  *      dev_get_by_name         - find a device by its name
 608  *      @net: the applicable net namespace
 609  *      @name: name to find
 610  *
 611  *      Find an interface by name. This can be called from any
 612  *      context and does its own locking. The returned handle has
 613  *      the usage count incremented and the caller must use dev_put() to
 614  *      release it when it is no longer needed. %NULL is returned if no
 615  *      matching device is found.
 616  */
 617
 618 struct net_device *dev_get_by_name(struct net *net, const char *name)
 619 {
 620         struct net_device *dev;
 621
 622         read_lock(&dev_base_lock);
 623         dev = __dev_get_by_name(net, name);
 624         if (dev)
 625                 dev_hold(dev);
 626         read_unlock(&dev_base_lock);
 627         return dev;
 628 }
 629
 630 /**
 631  *      __dev_get_by_index - find a device by its ifindex
 632  *      @net: the applicable net namespace
 633  *      @ifindex: index of device
 634  *
 635  *      Search for an interface by index. Returns %NULL if the device
 636  *      is not found or a pointer to the device. The device has not
 637  *      had its reference counter increased so the caller must be careful
 638  *      about locking. The caller must hold either the RTNL semaphore
 639  *      or @dev_base_lock.
 640  */
 641
 642 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 643 {
 644         struct hlist_node *p;
 645
 646         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 647                 struct net_device *dev
 648                         = hlist_entry(p, struct net_device, index_hlist);
 649                 if (dev->ifindex == ifindex)
 650                         return dev;
 651         }
 652         return NULL;
 653 }
 654
 655
 656 /**
 657  *      dev_get_by_index - find a device by its ifindex
 658  *      @net: the applicable net namespace
 659  *      @ifindex: index of device
 660  *
 661  *      Search for an interface by index. Returns NULL if the device
 662  *      is not found or a pointer to the device. The device returned has
 663  *      had a reference added and the pointer is safe until the user calls
 664  *      dev_put to indicate they have finished with it.
 665  */
 666
 667 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 668 {
 669         struct net_device *dev;
 670
 671         read_lock(&dev_base_lock);
 672         dev = __dev_get_by_index(net, ifindex);
 673         if (dev)
 674                 dev_hold(dev);
 675         read_unlock(&dev_base_lock);
 676         return dev;
 677 }
 678
 679 /**
 680  *      dev_getbyhwaddr - find a device by its hardware address
 681  *      @net: the applicable net namespace
 682  *      @type: media type of device
 683  *      @ha: hardware address
 684  *
 685  *      Search for an interface by MAC address. Returns NULL if the device
 686  *      is not found or a pointer to the device. The caller must hold the
 687  *      rtnl semaphore. The returned device has not had its ref count increased
 688  *      and the caller must therefore be careful about locking
 689  *
 690  *      BUGS:
 691  *      If the API was consistent this would be __dev_get_by_hwaddr
 692  */
 693
 694 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 695 {
 696         struct net_device *dev;
 697
 698         ASSERT_RTNL();
 699
 700         for_each_netdev(net, dev)
 701                 if (dev->type == type &&
 702                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707
 708 EXPORT_SYMBOL(dev_getbyhwaddr);
 709
 710 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 711 {
 712         struct net_device *dev;
 713
 714         ASSERT_RTNL();
 715         for_each_netdev(net, dev)
 716                 if (dev->type == type)
 717                         return dev;
 718
 719         return NULL;
 720 }
 721
 722 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 723
 724 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 725 {
 726         struct net_device *dev;
 727
 728         rtnl_lock();
 729         dev = __dev_getfirstbyhwtype(net, type);
 730         if (dev)
 731                 dev_hold(dev);
 732         rtnl_unlock();
 733         return dev;
 734 }
 735
 736 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 737
 738 /**
 739  *      dev_get_by_flags - find any device with given flags
 740  *      @net: the applicable net namespace
 741  *      @if_flags: IFF_* values
 742  *      @mask: bitmask of bits in if_flags to check
 743  *
 744  *      Search for any interface with the given flags. Returns NULL if a device
 745  *      is not found or a pointer to the device. The device returned has
 746  *      had a reference added and the pointer is safe until the user calls
 747  *      dev_put to indicate they have finished with it.
 748  */
 749
 750 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 751 {
 752         struct net_device *dev, *ret;
 753
 754         ret = NULL;
 755         read_lock(&dev_base_lock);
 756         for_each_netdev(net, dev) {
 757                 if (((dev->flags ^ if_flags) & mask) == 0) {
 758                         dev_hold(dev);
 759                         ret = dev;
 760                         break;
 761                 }
 762         }
 763         read_unlock(&dev_base_lock);
 764         return ret;
 765 }
 766
 767 /**
 768  *      dev_valid_name - check if name is okay for network device
 769  *      @name: name string
 770  *
 771  *      Network device names need to be valid file names to
 772  *      to allow sysfs to work.  We also disallow any kind of
 773  *      whitespace.
 774  */
 775 int dev_valid_name(const char *name)
 776 {
 777         if (*name == '\0')
 778                 return 0;
 779         if (strlen(name) >= IFNAMSIZ)
 780                 return 0;
 781         if (!strcmp(name, ".") || !strcmp(name, ".."))
 782                 return 0;
 783
 784         while (*name) {
 785                 if (*name == '/' || isspace(*name))
 786                         return 0;
 787                 name++;
 788         }
 789         return 1;
 790 }
 791
 792 /**
 793  *      __dev_alloc_name - allocate a name for a device
 794  *      @net: network namespace to allocate the device name in
 795  *      @name: name format string
 796  *      @buf:  scratch buffer and result name string
 797  *
 798  *      Passed a format string - eg "lt%d" it will try and find a suitable
 799  *      id. It scans list of devices to build up a free map, then chooses
 800  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 801  *      while allocating the name and adding the device in order to avoid
 802  *      duplicates.
 803  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 804  *      Returns the number of the unit assigned or a negative errno code.
 805  */
 806
 807 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 808 {
 809         int i = 0;
 810         const char *p;
 811         const int max_netdevices = 8*PAGE_SIZE;
 812         unsigned long *inuse;
 813         struct net_device *d;
 814
 815         p = strnchr(name, IFNAMSIZ-1, '%');
 816         if (p) {
 817                 /*
 818                  * Verify the string as this thing may have come from
 819                  * the user.  There must be either one "%d" and no other "%"
 820                  * characters.
 821                  */
 822                 if (p[1] != 'd' || strchr(p + 2, '%'))
 823                         return -EINVAL;
 824
 825                 /* Use one page as a bit array of possible slots */
 826                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 827                 if (!inuse)
 828                         return -ENOMEM;
 829
 830                 for_each_netdev(net, d) {
 831                         if (!sscanf(d->name, name, &i))
 832                                 continue;
 833                         if (i < 0 || i >= max_netdevices)
 834                                 continue;
 835
 836                         /*  avoid cases where sscanf is not exact inverse of printf */
 837                         snprintf(buf, IFNAMSIZ, name, i);
 838                         if (!strncmp(buf, d->name, IFNAMSIZ))
 839                                 set_bit(i, inuse);
 840                 }
 841
 842                 i = find_first_zero_bit(inuse, max_netdevices);
 843                 free_page((unsigned long) inuse);
 844         }
 845
 846         snprintf(buf, IFNAMSIZ, name, i);
 847         if (!__dev_get_by_name(net, buf))
 848                 return i;
 849
 850         /* It is possible to run out of possible slots
 851          * when the name is long and there isn't enough space left
 852          * for the digits, or if all bits are used.
 853          */
 854         return -ENFILE;
 855 }
 856
 857 /**
 858  *      dev_alloc_name - allocate a name for a device
 859  *      @dev: device
 860  *      @name: name format string
 861  *
 862  *      Passed a format string - eg "lt%d" it will try and find a suitable
 863  *      id. It scans list of devices to build up a free map, then chooses
 864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865  *      while allocating the name and adding the device in order to avoid
 866  *      duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned or a negative errno code.
 869  */
 870
 871 int dev_alloc_name(struct net_device *dev, const char *name)
 872 {
 873         char buf[IFNAMSIZ];
 874         struct net *net;
 875         int ret;
 876
 877         BUG_ON(!dev_net(dev));
 878         net = dev_net(dev);
 879         ret = __dev_alloc_name(net, name, buf);
 880         if (ret >= 0)
 881                 strlcpy(dev->name, buf, IFNAMSIZ);
 882         return ret;
 883 }
 884
 885
 886 /**
 887  *      dev_change_name - change name of a device
 888  *      @dev: device
 889  *      @newname: name (or format string) must be at least IFNAMSIZ
 890  *
 891  *      Change name of a device, can pass format strings "eth%d".
 892  *      for wildcarding.
 893  */
 894 int dev_change_name(struct net_device *dev, char *newname)
 895 {
 896         char oldname[IFNAMSIZ];
 897         int err = 0;
 898         int ret;
 899         struct net *net;
 900
 901         ASSERT_RTNL();
 902         BUG_ON(!dev_net(dev));
 903
 904         net = dev_net(dev);
 905         if (dev->flags & IFF_UP)
 906                 return -EBUSY;
 907
 908         if (!dev_valid_name(newname))
 909                 return -EINVAL;
 910
 911         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 912                 return 0;
 913
 914         memcpy(oldname, dev->name, IFNAMSIZ);
 915
 916         if (strchr(newname, '%')) {
 917                 err = dev_alloc_name(dev, newname);
 918                 if (err < 0)
 919                         return err;
 920                 strcpy(newname, dev->name);
 921         }
 922         else if (__dev_get_by_name(net, newname))
 923                 return -EEXIST;
 924         else
 925                 strlcpy(dev->name, newname, IFNAMSIZ);
 926
 927 rollback:
 928         err = device_rename(&dev->dev, dev->name);
 929         if (err) {
 930                 memcpy(dev->name, oldname, IFNAMSIZ);
 931                 return err;
 932         }
 933
 934         write_lock_bh(&dev_base_lock);
 935         hlist_del(&dev->name_hlist);
 936         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 937         write_unlock_bh(&dev_base_lock);
 938
 939         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 940         ret = notifier_to_errno(ret);
 941
 942         if (ret) {
 943                 if (err) {
 944                         printk(KERN_ERR
 945                                "%s: name change rollback failed: %d.\n",
 946                                dev->name, ret);
 947                 } else {
 948                         err = ret;
 949                         memcpy(dev->name, oldname, IFNAMSIZ);
 950                         goto rollback;
 951                 }
 952         }
 953
 954         return err;
 955 }
 956
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Sends
 *	%NETDEV_FEAT_CHANGE for @dev to the netdev notifier chain; the
 *	chain's result is not reported to the caller.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
 968
 969 /**
 970  *      netdev_state_change - device changes state
 971  *      @dev: device to cause notification
 972  *
 973  *      Called to indicate a device has changed state. This function calls
 974  *      the notifier chains for netdev_chain and sends a NEWLINK message
 975  *      to the routing socket.
 976  */
 977 void netdev_state_change(struct net_device *dev)
 978 {
 979         if (dev->flags & IFF_UP) {
 980                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
 981                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 982         }
 983 }
 984
/**
 *	netdev_bonding_change - announce a bonding failover
 *	@dev: device to cause notification
 *
 *	Sends %NETDEV_BONDING_FAILOVER for @dev to the netdev notifier
 *	chain; the chain's result is not reported to the caller.
 */
void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
 990
 991 /**
 992  *      dev_load        - load a network module
 993  *      @net: the applicable net namespace
 994  *      @name: name of interface
 995  *
 996  *      If a network interface is not present and the process has suitable
 997  *      privileges this function loads the module. If module loading is not
 998  *      available in this kernel then it becomes a nop.
 999  */
1000
1001 void dev_load(struct net *net, const char *name)
1002 {
1003         struct net_device *dev;
1004
1005         read_lock(&dev_base_lock);
1006         dev = __dev_get_by_name(net, name);
1007         read_unlock(&dev_base_lock);
1008
1009         if (!dev && capable(CAP_SYS_MODULE))
1010                 request_module("%s", name);
1011 }
1012
1013 /**
1014  *      dev_open        - prepare an interface for use.
1015  *      @dev:   device to open
1016  *
1017  *      Takes a device from down to up state. The device's private open
1018  *      function is invoked and then the multicast lists are loaded. Finally
1019  *      the device is moved into the up state and a %NETDEV_UP message is
1020  *      sent to the netdev notifier chain.
1021  *
1022  *      Calling this function on an active interface is a nop. On a failure
1023  *      a negative errno code is returned.
1024  */
1025 int dev_open(struct net_device *dev)
1026 {
1027         int ret = 0;
1028
1029         ASSERT_RTNL();
1030
1031         /*
1032          *      Is it already up?
1033          */
1034
1035         if (dev->flags & IFF_UP)
1036                 return 0;
1037
1038         /*
1039          *      Is it even present?
1040          */
1041         if (!netif_device_present(dev))
1042                 return -ENODEV;
1043
1044         /*
1045          *      Call device private open method
1046          */
1047         set_bit(__LINK_STATE_START, &dev->state);
1048
1049         if (dev->validate_addr)
1050                 ret = dev->validate_addr(dev);
1051
1052         if (!ret && dev->open)
1053                 ret = dev->open(dev);
1054
1055         /*
1056          *      If it went open OK then:
1057          */
1058
1059         if (ret)
1060                 clear_bit(__LINK_STATE_START, &dev->state);
1061         else {
1062                 /*
1063                  *      Set the flags.
1064                  */
1065                 dev->flags |= IFF_UP;
1066
1067                 /*
1068                  *      Initialize multicasting status
1069                  */
1070                 dev_set_rx_mode(dev);
1071
1072                 /*
1073                  *      Wakeup transmit queue engine
1074                  */
1075                 dev_activate(dev);
1076
1077                 /*
1078                  *      ... and announce new interface.
1079                  */
1080                 call_netdevice_notifiers(NETDEV_UP, dev);
1081         }
1082
1083         return ret;
1084 }
1085
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Must be called with the RTNL held and may sleep. Calling it on a
 *	device that is not up is a nop. Always returns 0.
 */
int dev_close(struct net_device *dev)
{
	ASSERT_RTNL();

	might_sleep();

	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for the shutdown while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (dev->stop)
		dev->stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
1145
1146
1147 /**
1148  *      dev_disable_lro - disable Large Receive Offload on a device
1149  *      @dev: device
1150  *
1151  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1152  *      called under RTNL.  This is needed if received packets may be
1153  *      forwarded to another interface.
1154  */
1155 void dev_disable_lro(struct net_device *dev)
1156 {
1157         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1158             dev->ethtool_ops->set_flags) {
1159                 u32 flags = dev->ethtool_ops->get_flags(dev);
1160                 if (flags & ETH_FLAG_LRO) {
1161                         flags &= ~ETH_FLAG_LRO;
1162                         dev->ethtool_ops->set_flags(dev, flags);
1163                 }
1164         }
1165         WARN_ON(dev->features & NETIF_F_LRO);
1166 }
1167 EXPORT_SYMBOL(dev_disable_lro);
1168
1169
/*
 * Non-zero from start-up. While it is set, register_netdevice_notifier()
 * only links the notifier and does not replay register/up events for
 * existing devices. NOTE(review): presumably cleared once device
 * initialisation has finished - confirm where it is reset.
 */
static int dev_boot_phase = 1;
1171
1172 /*
1173  *      Device change register/unregister. These are not inline or static
1174  *      as we export them to the world.
1175  */
1176
1177 /**
1178  *      register_netdevice_notifier - register a network notifier block
1179  *      @nb: notifier
1180  *
1181  *      Register a notifier to be called when network device events occur.
1182  *      The notifier passed is linked into the kernel structures and must
1183  *      not be reused until it has been unregistered. A negative errno code
1184  *      is returned on a failure.
1185  *
1186  *      When registered all registration and up events are replayed
1187  *      to the new notifier to allow device to have a race free
1188  *      view of the network device list.
1189  */
1190
1191 int register_netdevice_notifier(struct notifier_block *nb)
1192 {
1193         struct net_device *dev;
1194         struct net_device *last;
1195         struct net *net;
1196         int err;
1197
1198         rtnl_lock();
1199         err = raw_notifier_chain_register(&netdev_chain, nb);
1200         if (err)
1201                 goto unlock;
1202         if (dev_boot_phase)
1203                 goto unlock;
1204         for_each_net(net) {
1205                 for_each_netdev(net, dev) {
1206                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1207                         err = notifier_to_errno(err);
1208                         if (err)
1209                                 goto rollback;
1210
1211                         if (!(dev->flags & IFF_UP))
1212                                 continue;
1213
1214                         nb->notifier_call(nb, NETDEV_UP, dev);
1215                 }
1216         }
1217
1218 unlock:
1219         rtnl_unlock();
1220         return err;
1221
1222 rollback:
1223         last = dev;
1224         for_each_net(net) {
1225                 for_each_netdev(net, dev) {
1226                         if (dev == last)
1227                                 break;
1228
1229                         if (dev->flags & IFF_UP) {
1230                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1231                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1232                         }
1233                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1234                 }
1235         }
1236
1237         raw_notifier_chain_unregister(&netdev_chain, nb);
1238         goto unlock;
1239 }
1240
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	Takes the RTNL around the removal. No events are sent to @nb for
 *	the devices that currently exist.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
1260
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks on netdev_chain.  Parameters and
 *	return value are as for raw_notifier_call_chain(); use
 *	notifier_to_errno() to turn the result into an errno code.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1274
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Raised by net_enable_timestamp(), lowered by net_disable_timestamp()
 * and read by net_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1277
/*
 * Register one more consumer of skb time stamps by incrementing
 * netstamp_needed. Each call is expected to be paired with a
 * net_disable_timestamp().
 */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
1282
/*
 * Drop one consumer of skb time stamps by decrementing netstamp_needed.
 * The counter is not checked for underflow here.
 */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
1287
1288 static inline void net_timestamp(struct sk_buff *skb)
1289 {
1290         if (atomic_read(&netstamp_needed))
1291                 __net_timestamp(skb);
1292         else
1293                 skb->tstamp.tv64 = 0;
1294 }
1295
1296 /*
1297  *      Support routine. Sends outgoing frames to any network
1298  *      taps currently in use.
1299  */
1300
1301 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1302 {
1303         struct packet_type *ptype;
1304
1305         net_timestamp(skb);
1306
1307         rcu_read_lock();
1308         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1309                 /* Never send packets back to the socket
1310                  * they originated from - MvS (miquels@drinkel.ow.org)
1311                  */
1312                 if ((ptype->dev == dev || !ptype->dev) &&
1313                     (ptype->af_packet_priv == NULL ||
1314                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1315                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1316                         if (!skb2)
1317                                 break;
1318
1319                         /* skb->nh should be correctly
1320                            set by sender, so that the second statement is
1321                            just protection against buggy protocols.
1322                          */
1323                         skb_reset_mac_header(skb2);
1324
1325                         if (skb_network_header(skb2) < skb2->data ||
1326                             skb2->network_header > skb2->tail) {
1327                                 if (net_ratelimit())
1328                                         printk(KERN_CRIT "protocol %04x is "
1329                                                "buggy, dev %s\n",
1330                                                skb2->protocol, dev->name);
1331                                 skb_reset_network_header(skb2);
1332                         }
1333
1334                         skb2->transport_header = skb2->network_header;
1335                         skb2->pkt_type = PACKET_OUTGOING;
1336                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1337                 }
1338         }
1339         rcu_read_unlock();
1340 }
1341
1342
/*
 * Put @q at the head of the current CPU's softnet_data output_queue and
 * raise NET_TX_SOFTIRQ. Local interrupts are disabled around the list
 * update. Does not look at __QDISC_STATE_SCHED; see __netif_schedule().
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* Push onto the singly linked list threaded through next_sched. */
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1355
1356 void __netif_schedule(struct Qdisc *q)
1357 {
1358         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1359                 __netif_reschedule(q);
1360 }
1361 EXPORT_SYMBOL(__netif_schedule);
1362
1363 void dev_kfree_skb_irq(struct sk_buff *skb)
1364 {
1365         if (atomic_dec_and_test(&skb->users)) {
1366                 struct softnet_data *sd;
1367                 unsigned long flags;
1368
1369                 local_irq_save(flags);
1370                 sd = &__get_cpu_var(softnet_data);
1371                 skb->next = sd->completion_queue;
1372                 sd->completion_queue = skb;
1373                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1374                 local_irq_restore(flags);
1375         }
1376 }
1377 EXPORT_SYMBOL(dev_kfree_skb_irq);
1378
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1387
1388
1389 /**
1390  * netif_device_detach - mark device as removed
1391  * @dev: network device
1392  *
1393  * Mark device as removed from system and therefore no longer available.
1394  */
1395 void netif_device_detach(struct net_device *dev)
1396 {
1397         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1398             netif_running(dev)) {
1399                 netif_stop_queue(dev);
1400         }
1401 }
1402 EXPORT_SYMBOL(netif_device_detach);
1403
1404 /**
1405  * netif_device_attach - mark device as attached
1406  * @dev: network device
1407  *
1408  * Mark device as attached from system and restart if needed.
1409  */
1410 void netif_device_attach(struct net_device *dev)
1411 {
1412         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1413             netif_running(dev)) {
1414                 netif_wake_queue(dev);
1415                 __netdev_watchdog_up(dev);
1416         }
1417 }
1418 EXPORT_SYMBOL(netif_device_attach);
1419
1420 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1421 {
1422         return ((features & NETIF_F_GEN_CSUM) ||
1423                 ((features & NETIF_F_IP_CSUM) &&
1424                  protocol == htons(ETH_P_IP)) ||
1425                 ((features & NETIF_F_IPV6_CSUM) &&
1426                  protocol == htons(ETH_P_IPV6)));
1427 }
1428
1429 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1430 {
1431         if (can_checksum_protocol(dev->features, skb->protocol))
1432                 return true;
1433
1434         if (skb->protocol == htons(ETH_P_8021Q)) {
1435                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1436                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1437                                           veh->h_vlan_encapsulated_proto))
1438                         return true;
1439         }
1440
1441         return false;
1442 }
1443
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For CHECKSUM_COMPLETE packets and for GSO packets only ip_summed is
 * reset to CHECKSUM_NONE. Otherwise the checksum is computed in software
 * from csum_start to the end of the packet and stored csum_offset bytes
 * after csum_start.
 *
 * Returns 0 on success, or the error from pskb_expand_head() when a
 * private copy of the header was needed and could not be made; in that
 * case ip_summed is left unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* csum_start is counted from skb->head; make it relative to data. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on offset is where the checksum field itself lives. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private copy of the header if the field is not writable. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
1481
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() is returned; it is -EPROTONOSUPPORT when no
 *	registered protocol handler provides a gso_segment method for the
 *	skb's protocol. The skb must not carry a frag_list.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	/* Record the MAC header and step past it to the network header. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/* Unexpected checksum state: warn, and unshare a cloned header. */
	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				/* err == 0 makes this NULL: nothing to segment. */
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore data to the MAC header for the caller. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1535
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to the network rate limit.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *ifname;

	if (!net_ratelimit())
		return;

	ifname = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", ifname);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1548
/*
 * Report whether @skb has a page fragment in high memory while @dev does
 * not advertise NETIF_F_HIGHDMA. Returns 1 in that case and 0 otherwise;
 * always 0 when the kernel is built without CONFIG_HIGHMEM.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	/* Only the page fragments are inspected, not the linear data. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}
1569
/*
 * Per-skb state kept in skb->cb while software GSO segments hang off
 * skb->next: the skb's original destructor, saved by dev_gso_segment()
 * while dev_gso_skb_destructor() is installed in its place.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1575
/*
 * Destructor installed by dev_gso_segment(): frees every segment still
 * chained on skb->next, then calls the original destructor saved in the
 * control buffer, if there is one.
 *
 * NOTE(review): the loop dereferences skb->next before testing it, so
 * at least one segment must be chained when this runs - confirm that
 * every path resets skb->destructor before the list becomes empty.
 */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		/* Unlink the first segment and free it. */
		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
1592
1593 /**
1594  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1595  *      @skb: buffer to segment
1596  *
1597  *      This function segments the given skb and stores the list of segments
1598  *      in skb->next.
1599  */
1600 static int dev_gso_segment(struct sk_buff *skb)
1601 {
1602         struct net_device *dev = skb->dev;
1603         struct sk_buff *segs;
1604         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1605                                          NETIF_F_SG : 0);
1606
1607         segs = skb_gso_segment(skb, features);
1608
1609         /* Verifying header integrity only. */
1610         if (!segs)
1611                 return 0;
1612
1613         if (IS_ERR(segs))
1614                 return PTR_ERR(segs);
1615
1616         skb->next = segs;
1617         DEV_GSO_CB(skb)->destructor = skb->destructor;
1618         skb->destructor = dev_gso_skb_destructor;
1619
1620         return 0;
1621 }
1622
/*
 * Hand @skb to the driver's hard_start_xmit method of @dev.
 *
 * An skb without chained segments is first shown to the registered taps
 * (if any) and, when the device needs software GSO for it, segmented.
 * An unsegmented skb is passed to the driver directly and the driver's
 * return code is returned.
 *
 * When segments hang off skb->next they are transmitted one at a time.
 * If the driver refuses a segment it is put back at the head of the list
 * and the driver's code is returned; if @txq gets stopped while segments
 * remain, NETDEV_TX_BUSY is returned. In both cases the remaining
 * segments stay chained on @skb. Once all segments are sent, the original
 * destructor is restored and @skb is freed.
 *
 * A GSO segmentation failure frees @skb and still returns 0.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* No segments means only the headers were verified. */
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		/* Detach the first segment before giving it to the driver. */
		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Not accepted: relink it so that it can be retried. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments are gone; undo dev_gso_segment()'s destructor swap. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1663
/*
 * Random seed for simple_tx_hash() and a flag telling whether it has
 * been filled in yet; the seed is generated on the first hash request.
 */
static u32 simple_tx_hashrnd;
static int simple_tx_hashrnd_initialized = 0;
1666
/*
 * Pick a transmit queue index for @skb on @dev by hashing the packet's
 * addresses and, for the transport protocols listed below, the first
 * 32 bits that follow the network header.
 *
 * Only IPv4 and IPv6 packets are hashed; every other protocol maps to
 * queue 0. The result is scaled into [0, dev->real_num_tx_queues).
 */
static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
{
	u32 addr1, addr2, ports;
	u32 hash, ihl;
	u8 ip_proto = 0;

	/* Seed the hash lazily, on first use. */
	if (unlikely(!simple_tx_hashrnd_initialized)) {
		get_random_bytes(&simple_tx_hashrnd, 4);
		simple_tx_hashrnd_initialized = 1;
	}

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		/* Fragments keep ip_proto == 0, so no ports are read. */
		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
			ip_proto = ip_hdr(skb)->protocol;
		addr1 = ip_hdr(skb)->saddr;
		addr2 = ip_hdr(skb)->daddr;
		ihl = ip_hdr(skb)->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		ip_proto = ipv6_hdr(skb)->nexthdr;
		/* Only the last 32 bits of each address enter the hash. */
		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
		/* Fixed 40 byte header, expressed in 32-bit words like ihl. */
		ihl = (40 >> 2);
		break;
	default:
		return 0;
	}


	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		/* First 32 bits after the network header of ihl words. */
		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
		break;

	default:
		ports = 0;
		break;
	}

	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);

	/* Scale the 32-bit hash to the number of queues without a modulo. */
	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
1717
1718 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1719                                         struct sk_buff *skb)
1720 {
1721         u16 queue_index = 0;
1722
1723         if (dev->select_queue)
1724                 queue_index = dev->select_queue(dev, skb);
1725         else if (dev->real_num_tx_queues > 1)
1726                 queue_index = simple_tx_hash(dev, skb);
1727
1728         skb_set_queue_mapping(skb, queue_index);
1729         return netdev_get_tx_queue(dev, queue_index);
1730 }
1731
1732 /**
1733  *      dev_queue_xmit - transmit a buffer
1734  *      @skb: buffer to transmit
1735  *
1736  *      Queue a buffer for transmission to a network device. The caller must
1737  *      have set the device and priority and built the buffer before calling
1738  *      this function. The function can be called from an interrupt.
1739  *
1740  *      A negative errno code is returned on a failure. A success does not
1741  *      guarantee the frame will be transmitted as it may be dropped due
1742  *      to congestion or traffic shaping.
1743  *
1744  * -----------------------------------------------------------------------------------
1745  *      I notice this method can also return errors from the queue disciplines,
1746  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1747  *      be positive.
1748  *
1749  *      Regardless of the return value, the skb is consumed, so it is currently
1750  *      difficult to retry a send to this method.  (You can bump the ref count
1751  *      before sending to hold a reference for retry if you are careful.)
1752  *
1753  *      When calling this method, interrupts MUST be enabled.  This is because
1754  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1755  *          --BLG
1756  */
1757 int dev_queue_xmit(struct sk_buff *skb)
1758 {
1759         struct net_device *dev = skb->dev;
1760         struct netdev_queue *txq;
1761         struct Qdisc *q;
1762         int rc = -ENOMEM;       /* returned when linearizing or checksumming below fails */
1763
1764         /* GSO will handle the following emulations directly. */
1765         if (netif_needs_gso(dev, skb))
1766                 goto gso;
1767         /* An skb with a frag_list is linearized unless the device has NETIF_F_FRAGLIST. */
1768         if (skb_shinfo(skb)->frag_list &&
1769             !(dev->features & NETIF_F_FRAGLIST) &&
1770             __skb_linearize(skb))
1771                 goto out_kfree_skb;
1772
1773         /* Fragmented skb is linearized if device does not support SG,
1774          * or if at least one of fragments is in highmem and device
1775          * does not support DMA from it.
1776          */
1777         if (skb_shinfo(skb)->nr_frags &&
1778             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1779             __skb_linearize(skb))
1780                 goto out_kfree_skb;
1781
1782         /* If packet is not checksummed and device does not support
1783          * checksumming for this protocol, complete checksumming here.
1784          */
1785         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1786                 skb_set_transport_header(skb, skb->csum_start -
1787                                               skb_headroom(skb));
1788                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1789                         goto out_kfree_skb;
1790         }
1791
1792 gso:
1793         /* Disable soft irqs for various locks below. Also
1794          * stops preemption for RCU.
1795          */
1796         rcu_read_lock_bh();
1797
1798         txq = dev_pick_tx(dev, skb);    /* also stores the queue index in the skb */
1799         q = rcu_dereference(txq->qdisc);
1800
1801 #ifdef CONFIG_NET_CLS_ACT
1802         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1803 #endif
1804         if (q->enqueue) {       /* qdisc can queue: enqueue and run it under its lock */
1805                 spinlock_t *root_lock = qdisc_lock(q);
1806
1807                 spin_lock(root_lock);
1808
1809                 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1810                         kfree_skb(skb); /* deactivated qdisc: drop rather than queue */
1811                         rc = NET_XMIT_DROP;
1812                 } else {
1813                         rc = qdisc_enqueue_root(skb, q);
1814                         qdisc_run(q);
1815                 }
1816                 spin_unlock(root_lock);
1817
1818                 goto out;       /* skb was freed or enqueued above; do not free it again */
1819         }
1820
1821         /* The device has no queue. Common case for software devices:
1822            loopback, all the sorts of tunnels...
1823
1824            Really, it is unlikely that netif_tx_lock protection is necessary
1825            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1826            counters.)
1827            However, it is possible, that they rely on protection
1828            made by us here.
1829
1830            Check this and shot the lock. It is not prone from deadlocks.
1831            Either shot noqueue qdisc, it is even simpler 8)
1832          */
1833         if (dev->flags & IFF_UP) {
1834                 int cpu = smp_processor_id(); /* ok because BHs are off */
1835
1836                 if (txq->xmit_lock_owner != cpu) {
1837
1838                         HARD_TX_LOCK(dev, txq, cpu);
1839
1840                         if (!netif_tx_queue_stopped(txq)) {
1841                                 rc = 0;
1842                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1843                                         HARD_TX_UNLOCK(dev, txq);
1844                                         goto out;       /* zero return from xmit: exit with rc == 0 */
1845                                 }
1846                         }
1847                         HARD_TX_UNLOCK(dev, txq);
1848                         if (net_ratelimit())
1849                                 printk(KERN_CRIT "Virtual device %s asks to "
1850                                        "queue packet!\n", dev->name);
1851                 } else {
1852                         /* Recursion is detected! It is possible,
1853                          * unfortunately */
1854                         if (net_ratelimit())
1855                                 printk(KERN_CRIT "Dead loop on virtual device "
1856                                        "%s, fix it urgently!\n", dev->name);
1857                 }
1858         }
1859         /* Reached when the device is down, the queue is stopped, xmit failed or we recursed. */
1860         rc = -ENETDOWN;
1861         rcu_read_unlock_bh();
1862
1863 out_kfree_skb:          /* failure exit: the skb is still ours, so free it here */
1864         kfree_skb(skb);
1865         return rc;
1866 out:                    /* exit for paths on which the skb was already handed off or freed */
1867         rcu_read_unlock_bh();
1868         return rc;
1869 }
1870
1871
1872 /*=======================================================================
1873                         Receiver routines
1874   =======================================================================*/
1875
1876 int netdev_max_backlog __read_mostly = 1000;        /* netif_rx() drops once input_pkt_queue is longer than this */
1877 int netdev_budget __read_mostly = 300;      /* starting budget of each net_rx_action() run */
1878 int weight_p __read_mostly = 64;            /* old backlog weight; process_backlog() copies it to napi->weight */
1879
1880 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };     /* per-CPU total/dropped/time_squeeze counters */
1881
1882
1883 /**
1884  *      netif_rx        -       post buffer to the network code
1885  *      @skb: buffer to post
1886  *
1887  *      This function receives a packet from a device driver and queues it for
1888  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1889  *      may be dropped during processing for congestion control or by the
1890  *      protocol layers.
1891  *
1892  *      return values:
1893  *      NET_RX_SUCCESS  (no congestion)
1894  *      NET_RX_DROP     (packet was dropped)
1895  *
1896  */
1897
1898 int netif_rx(struct sk_buff *skb)
1899 {
1900         struct softnet_data *queue;
1901         unsigned long flags;
1902
1903         /* if netpoll wants it, pretend we never saw it */
1904         if (netpoll_rx(skb))
1905                 return NET_RX_DROP;
1906
1907         if (!skb->tstamp.tv64)  /* no timestamp recorded yet */
1908                 net_timestamp(skb);
1909
1910         /*
1911          * The code is arranged so that the path is shortest
1912          * when the CPU is congested but still operating.
1913          */
1914         local_irq_save(flags);  /* this CPU's queue and counters are touched with IRQs off */
1915         queue = &__get_cpu_var(softnet_data);
1916
1917         __get_cpu_var(netdev_rx_stat).total++;
1918         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1919                 if (queue->input_pkt_queue.qlen) {      /* non-empty: enqueue without scheduling */
1920 enqueue:
1921                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1922                         local_irq_restore(flags);
1923                         return NET_RX_SUCCESS;
1924                 }
1925                 /* Queue was empty: schedule the backlog NAPI, then queue the skb. */
1926                 napi_schedule(&queue->backlog);
1927                 goto enqueue;
1928         }
1929         /* Backlog is longer than netdev_max_backlog: count the drop and free the skb. */
1930         __get_cpu_var(netdev_rx_stat).dropped++;
1931         local_irq_restore(flags);
1932
1933         kfree_skb(skb);
1934         return NET_RX_DROP;
1935 }
1936
1937 int netif_rx_ni(struct sk_buff *skb)
1938 {       /* netif_rx() wrapper that also runs any pending softirqs before returning. */
1939         int err;
1940
1941         preempt_disable();      /* stay on one CPU between netif_rx() and the softirq check */
1942         err = netif_rx(skb);
1943         if (local_softirq_pending())
1944                 do_softirq();
1945         preempt_enable();
1946
1947         return err;             /* netif_rx()'s result, unchanged */
1948 }
1949
1950 EXPORT_SYMBOL(netif_rx_ni);
1951
1952 static void net_tx_action(struct softirq_action *h)
1953 {       /* Free this CPU's completion_queue skbs, then run the qdiscs on its output_queue. */
1954         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1955
1956         if (sd->completion_queue) {
1957                 struct sk_buff *clist;
1958
1959                 local_irq_disable();    /* detach the whole list with IRQs off ... */
1960                 clist = sd->completion_queue;
1961                 sd->completion_queue = NULL;
1962                 local_irq_enable();     /* ... and walk the private copy with IRQs on */
1963
1964                 while (clist) {
1965                         struct sk_buff *skb = clist;
1966                         clist = clist->next;
1967
1968                         WARN_ON(atomic_read(&skb->users));      /* warn if skb->users is non-zero */
1969                         __kfree_skb(skb);
1970                 }
1971         }
1972
1973         if (sd->output_queue) {
1974                 struct Qdisc *head;
1975
1976                 local_irq_disable();    /* same detach-then-process scheme as above */
1977                 head = sd->output_queue;
1978                 sd->output_queue = NULL;
1979                 local_irq_enable();
1980
1981                 while (head) {
1982                         struct Qdisc *q = head;
1983                         spinlock_t *root_lock;
1984
1985                         head = head->next_sched;        /* advance before q may be rescheduled */
1986
1987                         root_lock = qdisc_lock(q);
1988                         if (spin_trylock(root_lock)) {  /* got the lock: clear SCHED and run q */
1989                                 smp_mb__before_clear_bit();
1990                                 clear_bit(__QDISC_STATE_SCHED,
1991                                           &q->state);
1992                                 qdisc_run(q);
1993                                 spin_unlock(root_lock);
1994                         } else {        /* lock busy: reschedule, or just clear SCHED if deactivated */
1995                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
1996                                               &q->state)) {
1997                                         __netif_reschedule(q);
1998                                 } else {
1999                                         smp_mb__before_clear_bit();
2000                                         clear_bit(__QDISC_STATE_SCHED,
2001                                                   &q->state);
2002                                 }
2003                         }
2004                 }
2005         }
2006 }
2007
2008 static inline int deliver_skb(struct sk_buff *skb,
2009                               struct packet_type *pt_prev,
2010                               struct net_device *orig_dev)
2011 {       /* Hand skb to pt_prev's handler and return the handler's result. */
2012         atomic_inc(&skb->users);        /* extra reference taken before the handler is called */
2013         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2014 }
2015
2016 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2017 /* These hooks are defined here for ATM */
2018 struct net_bridge;
2019 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2020                                                 unsigned char *addr);
2021 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2022
2023 /*
2024  * If bridge module is loaded call bridging hook.
2025  *  returns NULL if packet was consumed.
2026  */
2027 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2028                                         struct sk_buff *skb) __read_mostly;
2029 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2030                                             struct packet_type **pt_prev, int *ret,
2031                                             struct net_device *orig_dev)
2032 {
2033         struct net_bridge_port *port;
2034         /* PACKET_LOOPBACK skbs and devices with no br_port bypass the bridge hook. */
2035         if (skb->pkt_type == PACKET_LOOPBACK ||
2036             (port = rcu_dereference(skb->dev->br_port)) == NULL)
2037                 return skb;
2038
2039         if (*pt_prev) {         /* deliver to the pending handler before the hook gets the skb */
2040                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2041                 *pt_prev = NULL;
2042         }
2043
2044         return br_handle_frame_hook(port, skb);
2045 }
2046 #else
2047 #define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
2048 #endif
2049
2050 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2051 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2052 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2053
2054 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2055                                              struct packet_type **pt_prev,
2056                                              int *ret,
2057                                              struct net_device *orig_dev)
2058 {
2059         if (skb->dev->macvlan_port == NULL)     /* no macvlan port on this device: pass through */
2060                 return skb;
2061
2062         if (*pt_prev) {         /* deliver to the pending handler before the hook gets the skb */
2063                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2064                 *pt_prev = NULL;
2065         }
2066         return macvlan_handle_frame_hook(skb);  /* the caller treats a NULL result as "stop" */
2067 }
2068 #else
2069 #define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
2070 #endif
2071
2072 #ifdef CONFIG_NET_CLS_ACT
2073 /* TODO: Maybe we should just force sch_ingress to be compiled in
2074  * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
2075  * instructions (a compare and 2 extra stores right now) if we don't
2076  * have it on but have CONFIG_NET_CLS_ACT.
2077  * NOTE: This doesn't stop any functionality; if you don't have
2078  * the ingress scheduler, you just can't add policies on ingress.
2079  *
2080  */
2081 static int ing_filter(struct sk_buff *skb)
2082 {       /* Run skb through the device's rx_queue qdisc; returns a TC_ACT_* verdict. */
2083         struct net_device *dev = skb->dev;
2084         u32 ttl = G_TC_RTTL(skb->tc_verd);
2085         struct netdev_queue *rxq;
2086         int result = TC_ACT_OK;         /* verdict when no qdisc is consulted */
2087         struct Qdisc *q;
2088
2089         if (MAX_RED_LOOP < ttl++) {     /* compares the old ttl, then increments it */
2090                 printk(KERN_WARNING
2091                        "Redir loop detected Dropping packet (%d->%d)\n",
2092                        skb->iif, dev->ifindex);
2093                 return TC_ACT_SHOT;
2094         }
2095
2096         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);  /* store the incremented ttl */
2097         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2098
2099         rxq = &dev->rx_queue;
2100
2101         q = rxq->qdisc;
2102         if (q != &noop_qdisc) {         /* noop_qdisc: skip, result stays TC_ACT_OK */
2103                 spin_lock(qdisc_lock(q));
2104                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2105                         result = qdisc_enqueue_root(skb, q);
2106                 spin_unlock(qdisc_lock(q));
2107         }
2108
2109         return result;
2110 }
2111
2112 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2113                                          struct packet_type **pt_prev,
2114                                          int *ret, struct net_device *orig_dev)
2115 {       /* Returns skb to continue processing, or NULL after freeing it. */
2116         if (skb->dev->rx_queue.qdisc == &noop_qdisc)    /* noop_qdisc on rx_queue: skip filtering */
2117                 goto out;
2118
2119         if (*pt_prev) {         /* deliver to the pending handler before filtering */
2120                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2121                 *pt_prev = NULL;
2122         } else {
2123                 /* Huh? Why does turning on AF_PACKET affect this? */
2124                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2125         }
2126
2127         switch (ing_filter(skb)) {
2128         case TC_ACT_SHOT:
2129         case TC_ACT_STOLEN:     /* both verdicts end processing: free the skb, return NULL */
2130                 kfree_skb(skb);
2131                 return NULL;
2132         }
2133
2134 out:
2135         skb->tc_verd = 0;       /* tc_verd is cleared on every path that returns the skb */
2136         return skb;
2137 }
2138 #endif
2139
2140 /*
2141  *      netif_nit_deliver - deliver received packets to network taps
2142  *      @skb: buffer
2143  *
2144  *      This function is used to deliver incoming packets to network
2145  *      taps. It should be used when the normal netif_receive_skb path
2146  *      is bypassed, for example because of VLAN acceleration.
2147  */
2148 void netif_nit_deliver(struct sk_buff *skb)
2149 {
2150         struct packet_type *ptype;
2151
2152         if (list_empty(&ptype_all))     /* nothing on ptype_all: leave the skb untouched */
2153                 return;
2154
2155         skb_reset_network_header(skb);
2156         skb_reset_transport_header(skb);
2157         skb->mac_len = skb->network_header - skb->mac_header;
2158
2159         rcu_read_lock();
2160         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2161                 if (!ptype->dev || ptype->dev == skb->dev)      /* no device set, or this device */
2162                         deliver_skb(skb, ptype, skb->dev);      /* return value is ignored */
2163         }
2164         rcu_read_unlock();
2165 }
2166
2167 /**
2168  *      netif_receive_skb - process receive buffer from network
2169  *      @skb: buffer to process
2170  *
2171  *      netif_receive_skb() is the main receive data processing function.
2172  *      It always succeeds. The buffer may be dropped during processing
2173  *      for congestion control or by the protocol layers.
2174  *
2175  *      This function may only be called from softirq context and interrupts
2176  *      should be enabled.
2177  *
2178  *      Return values (usually ignored):
2179  *      NET_RX_SUCCESS: no congestion
2180  *      NET_RX_DROP: packet was dropped
2181  */
2182 int netif_receive_skb(struct sk_buff *skb)
2183 {
2184         struct packet_type *ptype, *pt_prev;
2185         struct net_device *orig_dev;
2186         struct net_device *null_or_orig;
2187         int ret = NET_RX_DROP;          /* result when no handler ever runs */
2188         __be16 type;
2189
2190         /* if we've gotten here through NAPI, check netpoll */
2191         if (netpoll_receive_skb(skb))
2192                 return NET_RX_DROP;
2193
2194         if (!skb->tstamp.tv64)          /* no timestamp recorded yet */
2195                 net_timestamp(skb);
2196
2197         if (!skb->iif)                  /* record the incoming interface once */
2198                 skb->iif = skb->dev->ifindex;
2199
2200         null_or_orig = NULL;            /* NULL also matches handlers with no device set */
2201         orig_dev = skb->dev;
2202         if (orig_dev->master) {         /* device has a master: drop-filter or retarget */
2203                 if (skb_bond_should_drop(skb))
2204                         null_or_orig = orig_dev; /* deliver only exact match */
2205                 else
2206                         skb->dev = orig_dev->master;
2207         }
2208
2209         __get_cpu_var(netdev_rx_stat).total++;
2210
2211         skb_reset_network_header(skb);
2212         skb_reset_transport_header(skb);
2213         skb->mac_len = skb->network_header - skb->mac_header;
2214
2215         pt_prev = NULL;
2216
2217         rcu_read_lock();
2218
2219         /* Don't receive packets in an exiting network namespace */
2220         if (!net_alive(dev_net(skb->dev)))
2221                 goto out;
2222
2223 #ifdef CONFIG_NET_CLS_ACT
2224         if (skb->tc_verd & TC_NCLS) {   /* flag set: clear it and skip ptype_all and handle_ing */
2225                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2226                 goto ncls;
2227         }
2228 #endif
2229         /* pt_prev lags one match behind, so the last handler is called without an extra reference. */
2230         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2231                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2232                     ptype->dev == orig_dev) {
2233                         if (pt_prev)
2234                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2235                         pt_prev = ptype;
2236                 }
2237         }
2238
2239 #ifdef CONFIG_NET_CLS_ACT
2240         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2241         if (!skb)               /* NULL: handle_ing() already freed the skb */
2242                 goto out;
2243 ncls:
2244 #endif
2245
2246         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2247         if (!skb)               /* NULL: the bridge hook consumed the packet */
2248                 goto out;
2249         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2250         if (!skb)
2251                 goto out;
2252
2253         type = skb->protocol;   /* handlers are hashed by protocol in ptype_base[] */
2254         list_for_each_entry_rcu(ptype,
2255                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2256                 if (ptype->type == type &&
2257                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2258                      ptype->dev == orig_dev)) {
2259                         if (pt_prev)
2260                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2261                         pt_prev = ptype;
2262                 }
2263         }
2264
2265         if (pt_prev) {          /* last matching handler gets the skb with no extra reference */
2266                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2267         } else {
2268                 kfree_skb(skb);
2269                 /* Jamal, now you will not able to escape explaining
2270                  * me how you were going to use this. :-)
2271                  */
2272                 ret = NET_RX_DROP;
2273         }
2274
2275 out:
2276         rcu_read_unlock();
2277         return ret;
2278 }
2279
2280 /* Network device is going away, flush any packets still pending  */
2281 static void flush_backlog(void *arg)
2282 {       /* Unlink and free every skb for device arg on this CPU's input_pkt_queue. */
2283         struct net_device *dev = arg;
2284         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2285         struct sk_buff *skb, *tmp;
2286
2287         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)  /* safe walk: entries are unlinked */
2288                 if (skb->dev == dev) {
2289                         __skb_unlink(skb, &queue->input_pkt_queue);
2290                         kfree_skb(skb);
2291                 }
2292 }
2293
2294 static int process_backlog(struct napi_struct *napi, int quota)
2295 {       /* Feed skbs from this CPU's input_pkt_queue to netif_receive_skb(); returns the count. */
2296         int work = 0;
2297         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2298         unsigned long start_time = jiffies;
2299
2300         napi->weight = weight_p;        /* refresh the weight from the global on every call */
2301         do {
2302                 struct sk_buff *skb;
2303
2304                 local_irq_disable();    /* dequeue with IRQs off */
2305                 skb = __skb_dequeue(&queue->input_pkt_queue);
2306                 if (!skb) {             /* queue empty: complete the NAPI and stop */
2307                         __napi_complete(napi);
2308                         local_irq_enable();
2309                         break;
2310                 }
2311                 local_irq_enable();
2312
2313                 netif_receive_skb(skb); /* return value is ignored */
2314         } while (++work < quota && jiffies == start_time);      /* stop at quota or when jiffies changes */
2315
2316         return work;
2317 }
2318
2319 /**
2320  * __napi_schedule - schedule for receive
2321  * @n: entry to schedule
2322  *
2323  * The entry's receive function will be scheduled to run
2324  */
2325 void __napi_schedule(struct napi_struct *n)
2326 {
2327         unsigned long flags;
2328
2329         local_irq_save(flags);  /* list update and softirq raise happen with IRQs off */
2330         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);   /* append to this CPU's list */
2331         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2332         local_irq_restore(flags);
2333 }
2334 EXPORT_SYMBOL(__napi_schedule);
2335
2336
2337 static void net_rx_action(struct softirq_action *h)
2338 {       /* Poll the NAPI instances on this CPU's poll_list until budget or time runs out. */
2339         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2340         unsigned long start_time = jiffies;
2341         int budget = netdev_budget;
2342         void *have;             /* netpoll_poll_lock() result, handed back to netpoll_poll_unlock() */
2343
2344         local_irq_disable();
2345
2346         while (!list_empty(list)) {
2347                 struct napi_struct *n;
2348                 int work, weight;
2349
2350                 /* If softirq window is exhausted then punt.
2351                  *
2352                  * Note that this is a slight policy change from the
2353                  * previous NAPI code, which would allow up to 2
2354                  * jiffies to pass before breaking out.  The test
2355                  * used to be "jiffies - start_time > 1".
2356                  */
2357                 if (unlikely(budget <= 0 || jiffies != start_time))
2358                         goto softnet_break;
2359
2360                 local_irq_enable();
2361
2362                 /* Even though interrupts have been re-enabled, this
2363                  * access is safe because interrupts can only add new
2364                  * entries to the tail of this list, and only ->poll()
2365                  * calls can remove this head entry from the list.
2366                  */
2367                 n = list_entry(list->next, struct napi_struct, poll_list);
2368
2369                 have = netpoll_poll_lock(n);
2370
2371                 weight = n->weight;     /* sampled once; used for both poll() and the check below */
2372
2373                 /* This NAPI_STATE_SCHED test is for avoiding a race
2374                  * with netpoll's poll_napi().  Only the entity which
2375                  * obtains the lock and sees NAPI_STATE_SCHED set will
2376                  * actually make the ->poll() call.  Therefore we avoid
2377                  * accidentally calling ->poll() when NAPI is not scheduled.
2378                  */
2379                 work = 0;
2380                 if (test_bit(NAPI_STATE_SCHED, &n->state))
2381                         work = n->poll(n, weight);
2382
2383                 WARN_ON_ONCE(work > weight);    /* warn once if poll() reports more than its weight */
2384
2385                 budget -= work;
2386
2387                 local_irq_disable();
2388
2389                 /* Drivers must not modify the NAPI state if they
2390                  * consume the entire weight.  In such cases this code
2391                  * still "owns" the NAPI instance and therefore can
2392                  * move the instance around on the list at-will.
2393                  */
2394                 if (unlikely(work == weight)) {
2395                         if (unlikely(napi_disable_pending(n)))
2396                                 __napi_complete(n);
2397                         else
2398                                 list_move_tail(&n->poll_list, list);    /* requeue behind the other entries */
2399                 }
2400
2401                 netpoll_poll_unlock(have);
2402         }
2403 out:
2404         local_irq_enable();
2405
2406 #ifdef CONFIG_NET_DMA
2407         /*
2408          * There may not be any more sk_buffs coming right now, so push
2409          * any pending DMA copies to hardware
2410          */
2411         if (!cpus_empty(net_dma.channel_mask)) {
2412                 int chan_idx;
2413                 for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
2414                         struct dma_chan *chan = net_dma.channels[chan_idx];
2415                         if (chan)
2416                                 dma_async_memcpy_issue_pending(chan);
2417                 }
2418         }
2419 #endif
2420
2421         return;
2422
2423 softnet_break:          /* out of budget or time: count it, re-raise the softirq, finish via out */
2424         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2425         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2426         goto out;
2427 }
2428
2429 static gifconf_func_t * gifconf_list [NPROTO];      /* one handler slot per family; set by register_gifconf() */
2430
2431 /**
2432  *      register_gifconf        -       register a SIOCGIF handler
2433  *      @family: Address family
2434  *      @gifconf: Function handler
2435  *
2436  *      Register protocol dependent address dumping routines. The handler
2437  *      that is passed must not be freed or reused until it has been replaced
2438  *      by another handler.
2439  */
2440 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2441 {
2442         if (family >= NPROTO)           /* index would fall outside gifconf_list[] */
2443                 return -EINVAL;
2444         gifconf_list[family] = gifconf; /* overwrites any handler stored earlier */
2445         return 0;
2446 }
2447
2448
2449 /*
2450  *      Map an interface index to its name (SIOCGIFNAME)
2451  */
2452
2453 /*
2454  *      We need this ioctl for efficient implementation of the
2455  *      if_indextoname() function required by the IPv6 API.  Without
2456  *      it, we would have to search all the interfaces to find a
2457  *      match.  --pb
2458  */
2459
2460 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2461 {
2462         struct net_device *dev;
2463         struct ifreq ifr;
2464
2465         /*
2466          *      Fetch the caller's info block.
2467          */
2468
2469         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2470                 return -EFAULT;
2471
2472         read_lock(&dev_base_lock);      /* held across the lookup and the name copy */
2473         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2474         if (!dev) {
2475                 read_unlock(&dev_base_lock);
2476                 return -ENODEV;
2477         }
2478
2479         strcpy(ifr.ifr_name, dev->name);        /* NOTE(review): assumes dev->name fits ifr_name - confirm */
2480         read_unlock(&dev_base_lock);
2481         /* Copy the whole ifreq, now carrying the name, back to the caller. */
2482         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2483                 return -EFAULT;
2484         return 0;
2485 }
2486
2487 /*
2488  *      Perform a SIOCGIFCONF call. This structure will change
2489  *      size eventually, and there is nothing I can do about it.
2490  *      Thus we will need a 'compatibility mode'.
2491  */
2492
2493 static int dev_ifconf(struct net *net, char __user *arg)
2494 {
2495         struct ifconf ifc;
2496         struct net_device *dev;
2497         char __user *pos;       /* caller's output buffer; may be NULL */
2498         int len;                /* caller-supplied ifc_len */
2499         int total;              /* running sum of the handlers' return values */
2500         int i;
2501
2502         /*
2503          *      Fetch the caller's info block.
2504          */
2505
2506         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2507                 return -EFAULT;
2508
2509         pos = ifc.ifc_buf;
2510         len = ifc.ifc_len;
2511
2512         /*
2513          *      Loop over the interfaces, and write an info block for each.
2514          */
2515
2516         total = 0;
2517         for_each_netdev(net, dev) {
2518                 for (i = 0; i < NPROTO; i++) {
2519                         if (gifconf_list[i]) {  /* only families with a registered handler */
2520                                 int done;
2521                                 if (!pos)       /* no buffer: the handler gets NULL and length 0 */
2522                                         done = gifconf_list[i](dev, NULL, 0);
2523                                 else
2524                                         done = gifconf_list[i](dev, pos + total,
2525                                                                len - total);
2526                                 if (done < 0)   /* any negative handler result becomes -EFAULT */
2527                                         return -EFAULT;
2528                                 total += done;
2529                         }
2530                 }
2531         }
2532
2533         /*
2534          *      All done.  Write the updated control block back to the caller.
2535          */
2536         ifc.ifc_len = total;
2537
2538         /*
2539          *      Both BSD and Solaris return 0 here, so we do too.
2540          */
2541         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2542 }
2543
2544 #ifdef CONFIG_PROC_FS
2545 /*
2546  *      This is invoked by the /proc filesystem handler to display a device
2547  *      in detail.
2548  */
2549 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2550         __acquires(dev_base_lock)
2551 {
2552         struct net *net = seq_file_net(seq);
2553         loff_t off;
2554         struct net_device *dev;
2555
2556         read_lock(&dev_base_lock);      /* still held on every return path below */
2557         if (!*pos)                      /* position 0 is the header token */
2558                 return SEQ_START_TOKEN;
2559
2560         off = 1;                        /* devices are numbered from position 1 */
2561         for_each_netdev(net, dev)
2562                 if (off++ == *pos)
2563                         return dev;
2564
2565         return NULL;                    /* *pos is past the last device */
2566 }
2567
2568 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2569 {       /* Advance *pos; the first device follows the header token, else the next device. */
2570         struct net *net = seq_file_net(seq);
2571         ++*pos;
2572         return v == SEQ_START_TOKEN ?
2573                 first_net_device(net) : next_net_device((struct net_device *)v);
2574 }
2575
2576 void dev_seq_stop(struct seq_file *seq, void *v)
2577         __releases(dev_base_lock)
2578 {
2579         read_unlock(&dev_base_lock);    /* pairs with the read_lock() in dev_seq_start() */
2580 }
2581
2582 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2583 {       /* Print one row for dev: its name followed by 16 counters (8 receive, 8 transmit). */
2584         struct net_device_stats *stats = dev->get_stats(dev);
2585
2586         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2587                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2588                    dev->name, stats->rx_bytes, stats->rx_packets,
2589                    stats->rx_errors,
2590                    stats->rx_dropped + stats->rx_missed_errors, /* receive "drop" */
2591                    stats->rx_fifo_errors,
2592                    stats->rx_length_errors + stats->rx_over_errors +
2593                     stats->rx_crc_errors + stats->rx_frame_errors,      /* receive "frame" */
2594                    stats->rx_compressed, stats->multicast,
2595                    stats->tx_bytes, stats->tx_packets,
2596                    stats->tx_errors, stats->tx_dropped,
2597                    stats->tx_fifo_errors, stats->collisions,
2598                    stats->tx_carrier_errors +
2599                     stats->tx_aborted_errors +
2600                     stats->tx_window_errors +
2601                     stats->tx_heartbeat_errors, /* transmit "carrier" */
2602                    stats->tx_compressed);
2603 }
2604
2605 /*
2606  *      Called from the PROCfs module. This now uses the new arbitrary sized
2607  *      /proc/net interface to create /proc/net/dev
2608  */
2609 static int dev_seq_show(struct seq_file *seq, void *v)
2610 {
2611         if (v == SEQ_START_TOKEN)
2612                 seq_puts(seq, "Inter-|   Receive                            "
2613                               "                    |  Transmit\n"
2614                               " face |bytes    packets errs drop fifo frame "
2615                               "compressed multicast|bytes    packets errs "
2616                               "drop fifo colls carrier compressed\n");
2617         else
2618                 dev_seq_printf_stats(seq, v);
2619         return 0;
2620 }
2621
2622 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2623 {
2624         struct netif_rx_stats *rc = NULL;
2625
2626         while (*pos < nr_cpu_ids)
2627                 if (cpu_online(*pos)) {
2628                         rc = &per_cpu(netdev_rx_stat, *pos);
2629                         break;
2630                 } else
2631                         ++*pos;
2632         return rc;
2633 }
2634
2635 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2636 {
2637         return softnet_get_online(pos);
2638 }
2639
2640 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2641 {
2642         ++*pos;
2643         return softnet_get_online(pos);
2644 }
2645
/* seq_file ->stop: the softnet iterator takes no lock, so nothing to undo. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
	return;
}
2649
2650 static int softnet_seq_show(struct seq_file *seq, void *v)
2651 {
2652         struct netif_rx_stats *s = v;
2653
2654         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2655                    s->total, s->dropped, s->time_squeeze, 0,
2656                    0, 0, 0, 0, /* was fastroute */
2657                    s->cpu_collision );
2658         return 0;
2659 }
2660
2661 static const struct seq_operations dev_seq_ops = {
2662         .start = dev_seq_start,
2663         .next  = dev_seq_next,
2664         .stop  = dev_seq_stop,
2665         .show  = dev_seq_show,
2666 };
2667
2668 static int dev_seq_open(struct inode *inode, struct file *file)
2669 {
2670         return seq_open_net(inode, file, &dev_seq_ops,
2671                             sizeof(struct seq_net_private));
2672 }
2673
2674 static const struct file_operations dev_seq_fops = {
2675         .owner   = THIS_MODULE,
2676         .open    = dev_seq_open,
2677         .read    = seq_read,
2678         .llseek  = seq_lseek,
2679         .release = seq_release_net,
2680 };
2681
2682 static const struct seq_operations softnet_seq_ops = {
2683         .start = softnet_seq_start,
2684         .next  = softnet_seq_next,
2685         .stop  = softnet_seq_stop,
2686         .show  = softnet_seq_show,
2687 };
2688
2689 static int softnet_seq_open(struct inode *inode, struct file *file)
2690 {
2691         return seq_open(file, &softnet_seq_ops);
2692 }
2693
2694 static const struct file_operations softnet_seq_fops = {
2695         .owner   = THIS_MODULE,
2696         .open    = softnet_seq_open,
2697         .read    = seq_read,
2698         .llseek  = seq_lseek,
2699         .release = seq_release,
2700 };
2701
2702 static void *ptype_get_idx(loff_t pos)
2703 {
2704         struct packet_type *pt = NULL;
2705         loff_t i = 0;
2706         int t;
2707
2708         list_for_each_entry_rcu(pt, &ptype_all, list) {
2709                 if (i == pos)
2710                         return pt;
2711                 ++i;
2712         }
2713
2714         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2715                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2716                         if (i == pos)
2717                                 return pt;
2718                         ++i;
2719                 }
2720         }
2721         return NULL;
2722 }
2723
2724 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2725         __acquires(RCU)
2726 {
2727         rcu_read_lock();
2728         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2729 }
2730
2731 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2732 {
2733         struct packet_type *pt;
2734         struct list_head *nxt;
2735         int hash;
2736
2737         ++*pos;
2738         if (v == SEQ_START_TOKEN)
2739                 return ptype_get_idx(0);
2740
2741         pt = v;
2742         nxt = pt->list.next;
2743         if (pt->type == htons(ETH_P_ALL)) {
2744                 if (nxt != &ptype_all)
2745                         goto found;
2746                 hash = 0;
2747                 nxt = ptype_base[0].next;
2748         } else
2749                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2750
2751         while (nxt == &ptype_base[hash]) {
2752                 if (++hash >= PTYPE_HASH_SIZE)
2753                         return NULL;
2754                 nxt = ptype_base[hash].next;
2755         }
2756 found:
2757         return list_entry(nxt, struct packet_type, list);
2758 }
2759
2760 static void ptype_seq_stop(struct seq_file *seq, void *v)
2761         __releases(RCU)
2762 {
2763         rcu_read_unlock();
2764 }
2765
/*
 *	Print a human-readable form of the address @sym.  With
 *	CONFIG_KALLSYMS and a successful lookup this is
 *	"[:module:]symbol+0xoffset"; otherwise the raw pointer in brackets.
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset = 0, symsize;
	const char *symname;
	char *modname;
	char namebuf[128];

	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
				  &modname, namebuf);

	if (symname) {
		char *sep = ":";

		/* No module reported: print neither its name nor the colons. */
		if (modname == NULL) {
			sep = "";
			modname = sep;
		}

		seq_printf(seq, "%s%s%s%s+0x%lx", sep, modname, sep,
			   symname, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}
2790
2791 static int ptype_seq_show(struct seq_file *seq, void *v)
2792 {
2793         struct packet_type *pt = v;
2794
2795         if (v == SEQ_START_TOKEN)
2796                 seq_puts(seq, "Type Device      Function\n");
2797         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2798                 if (pt->type == htons(ETH_P_ALL))
2799                         seq_puts(seq, "ALL ");
2800                 else
2801                         seq_printf(seq, "%04x", ntohs(pt->type));
2802
2803                 seq_printf(seq, " %-8s ",
2804                            pt->dev ? pt->dev->name : "");
2805                 ptype_seq_decode(seq,  pt->func);
2806                 seq_putc(seq, '\n');
2807         }
2808
2809         return 0;
2810 }
2811
2812 static const struct seq_operations ptype_seq_ops = {
2813         .start = ptype_seq_start,
2814         .next  = ptype_seq_next,
2815         .stop  = ptype_seq_stop,
2816         .show  = ptype_seq_show,
2817 };
2818
2819 static int ptype_seq_open(struct inode *inode, struct file *file)
2820 {
2821         return seq_open_net(inode, file, &ptype_seq_ops,
2822                         sizeof(struct seq_net_private));
2823 }
2824
2825 static const struct file_operations ptype_seq_fops = {
2826         .owner   = THIS_MODULE,
2827         .open    = ptype_seq_open,
2828         .read    = seq_read,
2829         .llseek  = seq_lseek,
2830         .release = seq_release_net,
2831 };
2832
2833
2834 static int __net_init dev_proc_net_init(struct net *net)
2835 {
2836         int rc = -ENOMEM;
2837
2838         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2839                 goto out;
2840         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2841                 goto out_dev;
2842         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2843                 goto out_softnet;
2844
2845         if (wext_proc_init(net))
2846                 goto out_ptype;
2847         rc = 0;
2848 out:
2849         return rc;
2850 out_ptype:
2851         proc_net_remove(net, "ptype");
2852 out_softnet:
2853         proc_net_remove(net, "softnet_stat");
2854 out_dev:
2855         proc_net_remove(net, "dev");
2856         goto out;
2857 }
2858
2859 static void __net_exit dev_proc_net_exit(struct net *net)
2860 {
2861         wext_proc_exit(net);
2862
2863         proc_net_remove(net, "ptype");
2864         proc_net_remove(net, "softnet_stat");
2865         proc_net_remove(net, "dev");
2866 }
2867
2868 static struct pernet_operations __net_initdata dev_proc_ops = {
2869         .init = dev_proc_net_init,
2870         .exit = dev_proc_net_exit,
2871 };
2872
2873 static int __init dev_proc_init(void)
2874 {
2875         return register_pernet_subsys(&dev_proc_ops);
2876 }
2877 #else
2878 #define dev_proc_init() 0
2879 #endif  /* CONFIG_PROC_FS */
2880
2881
2882 /**
2883  *      netdev_set_master       -       set up master/slave pair
2884  *      @slave: slave device
2885  *      @master: new master device
2886  *
2887  *      Changes the master device of the slave. Pass %NULL to break the
2888  *      bonding. The caller must hold the RTNL semaphore. On a failure
2889  *      a negative errno code is returned. On success the reference counts
2890  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2891  *      function returns zero.
2892  */
2893 int netdev_set_master(struct net_device *slave, struct net_device *master)
2894 {
2895         struct net_device *old = slave->master;
2896
2897         ASSERT_RTNL();
2898
2899         if (master) {
2900                 if (old)
2901                         return -EBUSY;
2902                 dev_hold(master);
2903         }
2904
2905         slave->master = master;
2906
2907         synchronize_net();
2908
2909         if (old)
2910                 dev_put(old);
2911
2912         if (master)
2913                 slave->flags |= IFF_SLAVE;
2914         else
2915                 slave->flags &= ~IFF_SLAVE;
2916
2917         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2918         return 0;
2919 }
2920
2921 static int __dev_set_promiscuity(struct net_device *dev, int inc)
2922 {
2923         unsigned short old_flags = dev->flags;
2924
2925         ASSERT_RTNL();
2926
2927         dev->flags |= IFF_PROMISC;
2928         dev->promiscuity += inc;
2929         if (dev->promiscuity == 0) {
2930                 /*
2931                  * Avoid overflow.
2932                  * If inc causes overflow, untouch promisc and return error.
2933                  */
2934                 if (inc < 0)
2935                         dev->flags &= ~IFF_PROMISC;
2936                 else {
2937                         dev->promiscuity -= inc;
2938                         printk(KERN_WARNING "%s: promiscuity touches roof, "
2939                                 "set promiscuity failed, promiscuity feature "
2940                                 "of device might be broken.\n", dev->name);
2941                         return -EOVERFLOW;
2942                 }
2943         }
2944         if (dev->flags != old_flags) {
2945                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2946                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2947                                                                "left");
2948                 if (audit_enabled)
2949                         audit_log(current->audit_context, GFP_ATOMIC,
2950                                 AUDIT_ANOM_PROMISCUOUS,
2951                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2952                                 dev->name, (dev->flags & IFF_PROMISC),
2953                                 (old_flags & IFF_PROMISC),
2954                                 audit_get_loginuid(current),
2955                                 current->uid, current->gid,
2956                                 audit_get_sessionid(current));
2957
2958                 if (dev->change_rx_flags)
2959                         dev->change_rx_flags(dev, IFF_PROMISC);
2960         }
2961         return 0;
2962 }
2963
2964 /**
2965  *      dev_set_promiscuity     - update promiscuity count on a device
2966  *      @dev: device
2967  *      @inc: modifier
2968  *
2969  *      Add or remove promiscuity from a device. While the count in the device
2970  *      remains above zero the interface remains promiscuous. Once it hits zero
2971  *      the device reverts back to normal filtering operation. A negative inc
2972  *      value is used to drop promiscuity on the device.
2973  *      Return 0 if successful or a negative errno code on error.
2974  */
2975 int dev_set_promiscuity(struct net_device *dev, int inc)
2976 {
2977         unsigned short old_flags = dev->flags;
2978         int err;
2979
2980         err = __dev_set_promiscuity(dev, inc);
2981         if (err < 0)
2982                 return err;
2983         if (dev->flags != old_flags)
2984                 dev_set_rx_mode(dev);
2985         return err;
2986 }
2987
2988 /**
2989  *      dev_set_allmulti        - update allmulti count on a device
2990  *      @dev: device
2991  *      @inc: modifier
2992  *
2993  *      Add or remove reception of all multicast frames to a device. While the
2994  *      count in the device remains above zero the interface remains listening
2995  *      to all interfaces. Once it hits zero the device reverts back to normal
2996  *      filtering operation. A negative @inc value is used to drop the counter
2997  *      when releasing a resource needing all multicasts.
2998  *      Return 0 if successful or a negative errno code on error.
2999  */
3000
3001 int dev_set_allmulti(struct net_device *dev, int inc)
3002 {
3003         unsigned short old_flags = dev->flags;
3004
3005         ASSERT_RTNL();
3006
3007         dev->flags |= IFF_ALLMULTI;
3008         dev->allmulti += inc;
3009         if (dev->allmulti == 0) {
3010                 /*
3011                  * Avoid overflow.
3012                  * If inc causes overflow, untouch allmulti and return error.
3013                  */
3014                 if (inc < 0)
3015                         dev->flags &= ~IFF_ALLMULTI;
3016                 else {
3017                         dev->allmulti -= inc;
3018                         printk(KERN_WARNING "%s: allmulti touches roof, "
3019                                 "set allmulti failed, allmulti feature of "
3020                                 "device might be broken.\n", dev->name);
3021                         return -EOVERFLOW;
3022                 }
3023         }
3024         if (dev->flags ^ old_flags) {
3025                 if (dev->change_rx_flags)
3026                         dev->change_rx_flags(dev, IFF_ALLMULTI);
3027                 dev_set_rx_mode(dev);
3028         }
3029         return 0;
3030 }
3031
3032 /*
3033  *      Upload unicast and multicast address lists to device and
3034  *      configure RX filtering. When the device doesn't support unicast
3035  *      filtering it is put in promiscuous mode while unicast addresses
3036  *      are present.
3037  */
3038 void __dev_set_rx_mode(struct net_device *dev)
3039 {
3040         /* dev_open will call this function so the list will stay sane. */
3041         if (!(dev->flags&IFF_UP))
3042                 return;
3043
3044         if (!netif_device_present(dev))
3045                 return;
3046
3047         if (dev->set_rx_mode)
3048                 dev->set_rx_mode(dev);
3049         else {
3050                 /* Unicast addresses changes may only happen under the rtnl,
3051                  * therefore calling __dev_set_promiscuity here is safe.
3052                  */
3053                 if (dev->uc_count > 0 && !dev->uc_promisc) {
3054                         __dev_set_promiscuity(dev, 1);
3055                         dev->uc_promisc = 1;
3056                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3057                         __dev_set_promiscuity(dev, -1);
3058                         dev->uc_promisc = 0;
3059                 }
3060
3061                 if (dev->set_multicast_list)
3062                         dev->set_multicast_list(dev);
3063         }
3064 }
3065
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's address
 *	lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3072
3073 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3074                       void *addr, int alen, int glbl)
3075 {
3076         struct dev_addr_list *da;
3077
3078         for (; (da = *list) != NULL; list = &da->next) {
3079                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3080                     alen == da->da_addrlen) {
3081                         if (glbl) {
3082                                 int old_glbl = da->da_gusers;
3083                                 da->da_gusers = 0;
3084                                 if (old_glbl == 0)
3085                                         break;
3086                         }
3087                         if (--da->da_users)
3088                                 return 0;
3089
3090                         *list = da->next;
3091                         kfree(da);
3092                         (*count)--;
3093                         return 0;
3094                 }
3095         }
3096         return -ENOENT;
3097 }
3098
3099 int __dev_addr_add(struct dev_addr_list **list, int *count,
3100                    void *addr, int alen, int glbl)
3101 {
3102         struct dev_addr_list *da;
3103
3104         for (da = *list; da != NULL; da = da->next) {
3105                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3106                     da->da_addrlen == alen) {
3107                         if (glbl) {
3108                                 int old_glbl = da->da_gusers;
3109                                 da->da_gusers = 1;
3110                                 if (old_glbl)
3111                                         return 0;
3112                         }
3113                         da->da_users++;
3114                         return 0;
3115                 }
3116         }
3117
3118         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3119         if (da == NULL)
3120                 return -ENOMEM;
3121         memcpy(da->da_addr, addr, alen);
3122         da->da_addrlen = alen;
3123         da->da_users = 1;
3124         da->da_gusers = glbl ? 1 : 0;
3125         da->next = *list;
3126         *list = da;
3127         (*count)++;
3128         return 0;
3129 }
3130
3131 /**
3132  *      dev_unicast_delete      - Release secondary unicast address.
3133  *      @dev: device
3134  *      @addr: address to delete
3135  *      @alen: length of @addr
3136  *
3137  *      Release reference to a secondary unicast address and remove it
3138  *      from the device if the reference count drops to zero.
3139  *
3140  *      The caller must hold the rtnl_mutex.
3141  */
3142 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3143 {
3144         int err;
3145
3146         ASSERT_RTNL();
3147
3148         netif_addr_lock_bh(dev);
3149         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3150         if (!err)
3151                 __dev_set_rx_mode(dev);
3152         netif_addr_unlock_bh(dev);
3153         return err;
3154 }
3155 EXPORT_SYMBOL(dev_unicast_delete);
3156
3157 /**
3158  *      dev_unicast_add         - add a secondary unicast address
3159  *      @dev: device
3160  *      @addr: address to add
3161  *      @alen: length of @addr
3162  *
3163  *      Add a secondary unicast address to the device or increase
3164  *      the reference count if it already exists.
3165  *
3166  *      The caller must hold the rtnl_mutex.
3167  */
3168 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3169 {
3170         int err;
3171
3172         ASSERT_RTNL();
3173
3174         netif_addr_lock_bh(dev);
3175         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3176         if (!err)
3177                 __dev_set_rx_mode(dev);
3178         netif_addr_unlock_bh(dev);
3179         return err;
3180 }
3181 EXPORT_SYMBOL(dev_unicast_add);
3182
3183 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3184                     struct dev_addr_list **from, int *from_count)
3185 {
3186         struct dev_addr_list *da, *next;
3187         int err = 0;
3188
3189         da = *from;
3190         while (da != NULL) {
3191                 next = da->next;
3192                 if (!da->da_synced) {
3193                         err = __dev_addr_add(to, to_count,
3194                                              da->da_addr, da->da_addrlen, 0);
3195                         if (err < 0)
3196                                 break;
3197                         da->da_synced = 1;
3198                         da->da_users++;
3199                 } else if (da->da_users == 1) {
3200                         __dev_addr_delete(to, to_count,
3201                                           da->da_addr, da->da_addrlen, 0);
3202                         __dev_addr_delete(from, from_count,
3203                                           da->da_addr, da->da_addrlen, 0);
3204                 }
3205                 da = next;
3206         }
3207         return err;
3208 }
3209
3210 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3211                        struct dev_addr_list **from, int *from_count)
3212 {
3213         struct dev_addr_list *da, *next;
3214
3215         da = *from;
3216         while (da != NULL) {
3217                 next = da->next;
3218                 if (da->da_synced) {
3219                         __dev_addr_delete(to, to_count,
3220                                           da->da_addr, da->da_addrlen, 0);
3221                         da->da_synced = 0;
3222                         __dev_addr_delete(from, from_count,
3223                                           da->da_addr, da->da_addrlen, 0);
3224                 }
3225                 da = next;
3226         }
3227 }
3228
3229 /**
3230  *      dev_unicast_sync - Synchronize device's unicast list to another device
3231  *      @to: destination device
3232  *      @from: source device
3233  *
3234  *      Add newly added addresses to the destination device and release
3235  *      addresses that have no users left. The source device must be
3236  *      locked by netif_tx_lock_bh.
3237  *
3238  *      This function is intended to be called from the dev->set_rx_mode
3239  *      function of layered software devices.
3240  */
3241 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3242 {
3243         int err = 0;
3244
3245         netif_addr_lock_bh(to);
3246         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3247                               &from->uc_list, &from->uc_count);
3248         if (!err)
3249                 __dev_set_rx_mode(to);
3250         netif_addr_unlock_bh(to);
3251         return err;
3252 }
3253 EXPORT_SYMBOL(dev_unicast_sync);
3254
3255 /**
3256  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3257  *      @to: destination device
3258  *      @from: source device
3259  *
3260  *      Remove all addresses that were added to the destination device by
3261  *      dev_unicast_sync(). This function is intended to be called from the
3262  *      dev->stop function of layered software devices.
3263  */
3264 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3265 {
3266         netif_addr_lock_bh(from);
3267         netif_addr_lock(to);
3268
3269         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3270                           &from->uc_list, &from->uc_count);
3271         __dev_set_rx_mode(to);
3272
3273         netif_addr_unlock(to);
3274         netif_addr_unlock_bh(from);
3275 }
3276 EXPORT_SYMBOL(dev_unicast_unsync);
3277
3278 static void __dev_addr_discard(struct dev_addr_list **list)
3279 {
3280         struct dev_addr_list *tmp;
3281
3282         while (*list != NULL) {
3283                 tmp = *list;
3284                 *list = tmp->next;
3285                 if (tmp->da_users > tmp->da_gusers)
3286                         printk("__dev_addr_discard: address leakage! "
3287                                "da_users=%d\n", tmp->da_users);
3288                 kfree(tmp);
3289         }
3290 }
3291
3292 static void dev_addr_discard(struct net_device *dev)
3293 {
3294         netif_addr_lock_bh(dev);
3295
3296         __dev_addr_discard(&dev->uc_list);
3297         dev->uc_count = 0;
3298
3299         __dev_addr_discard(&dev->mc_list);
3300         dev->mc_count = 0;
3301
3302         netif_addr_unlock_bh(dev);
3303 }
3304
3305 unsigned dev_get_flags(const struct net_device *dev)
3306 {
3307         unsigned flags;
3308
3309         flags = (dev->flags & ~(IFF_PROMISC |
3310                                 IFF_ALLMULTI |
3311                                 IFF_RUNNING |
3312                                 IFF_LOWER_UP |
3313                                 IFF_DORMANT)) |
3314                 (dev->gflags & (IFF_PROMISC |
3315                                 IFF_ALLMULTI));
3316
3317         if (netif_running(dev)) {
3318                 if (netif_oper_up(dev))
3319                         flags |= IFF_RUNNING;
3320                 if (netif_carrier_ok(dev))
3321                         flags |= IFF_LOWER_UP;
3322                 if (netif_dormant(dev))
3323                         flags |= IFF_DORMANT;
3324         }
3325
3326         return flags;
3327 }
3328
3329 int dev_change_flags(struct net_device *dev, unsigned flags)
3330 {
3331         int ret, changes;
3332         int old_flags = dev->flags;
3333
3334         ASSERT_RTNL();
3335
3336         /*
3337          *      Set the flags on our device.
3338          */
3339
3340         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3341                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3342                                IFF_AUTOMEDIA)) |
3343                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3344                                     IFF_ALLMULTI));
3345
3346         /*
3347          *      Load in the correct multicast list now the flags have changed.
3348          */
3349
3350         if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
3351                 dev->change_rx_flags(dev, IFF_MULTICAST);
3352
3353         dev_set_rx_mode(dev);
3354
3355         /*
3356          *      Have we downed the interface. We handle IFF_UP ourselves
3357          *      according to user attempts to set it, rather than blindly
3358          *      setting it.
3359          */
3360
3361         ret = 0;
3362         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3363                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3364
3365                 if (!ret)
3366                         dev_set_rx_mode(dev);
3367         }
3368
3369         if (dev->flags & IFF_UP &&
3370             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3371                                           IFF_VOLATILE)))
3372                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3373
3374         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3375                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3376                 dev->gflags ^= IFF_PROMISC;
3377                 dev_set_promiscuity(dev, inc);
3378         }
3379
3380         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3381            is important. Some (broken) drivers set IFF_PROMISC, when
3382            IFF_ALLMULTI is requested not asking us and not reporting.
3383          */
3384         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3385                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3386                 dev->gflags ^= IFF_ALLMULTI;
3387                 dev_set_allmulti(dev, inc);
3388         }
3389
3390         /* Exclude state transition flags, already notified */
3391         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3392         if (changes)
3393                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3394
3395         return ret;
3396 }
3397
3398 int dev_set_mtu(struct net_device *dev, int new_mtu)
3399 {
3400         int err;
3401
3402         if (new_mtu == dev->mtu)
3403                 return 0;
3404
3405         /*      MTU must be positive.    */
3406         if (new_mtu < 0)
3407                 return -EINVAL;
3408
3409         if (!netif_device_present(dev))
3410                 return -ENODEV;
3411
3412         err = 0;
3413         if (dev->change_mtu)
3414                 err = dev->change_mtu(dev, new_mtu);
3415         else
3416                 dev->mtu = new_mtu;
3417         if (!err && dev->flags & IFF_UP)
3418                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3419         return err;
3420 }
3421
3422 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3423 {
3424         int err;
3425
3426         if (!dev->set_mac_address)
3427                 return -EOPNOTSUPP;
3428         if (sa->sa_family != dev->type)
3429                 return -EINVAL;
3430         if (!netif_device_present(dev))
3431                 return -ENODEV;
3432         err = dev->set_mac_address(dev, sa);
3433         if (!err)
3434                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3435         return err;
3436 }
3437
3438 /*
3439  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3440  */
3441 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3442 {
3443         int err;
3444         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3445
3446         if (!dev)
3447                 return -ENODEV;
3448
3449         switch (cmd) {
3450                 case SIOCGIFFLAGS:      /* Get interface flags */
3451                         ifr->ifr_flags = dev_get_flags(dev);
3452                         return 0;
3453
3454                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3455                                            (currently unused) */
3456                         ifr->ifr_metric = 0;
3457                         return 0;
3458
3459                 case SIOCGIFMTU:        /* Get the MTU of a device */
3460                         ifr->ifr_mtu = dev->mtu;
3461                         return 0;
3462
3463                 case SIOCGIFHWADDR:
3464                         if (!dev->addr_len)
3465                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3466                         else
3467                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3468                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3469                         ifr->ifr_hwaddr.sa_family = dev->type;
3470                         return 0;
3471
3472                 case SIOCGIFSLAVE:
3473                         err = -EINVAL;
3474                         break;
3475
3476                 case SIOCGIFMAP:
3477                         ifr->ifr_map.mem_start = dev->mem_start;
3478                         ifr->ifr_map.mem_end   = dev->mem_end;
3479                         ifr->ifr_map.base_addr = dev->base_addr;
3480                         ifr->ifr_map.irq       = dev->irq;
3481                         ifr->ifr_map.dma       = dev->dma;
3482                         ifr->ifr_map.port      = dev->if_port;
3483                         return 0;
3484
3485                 case SIOCGIFINDEX:
3486                         ifr->ifr_ifindex = dev->ifindex;
3487                         return 0;
3488
3489                 case SIOCGIFTXQLEN:
3490                         ifr->ifr_qlen = dev->tx_queue_len;
3491                         return 0;
3492
3493                 default:
3494                         /* dev_ioctl() should ensure this case
3495                          * is never reached
3496                          */
3497                         WARN_ON(1);
3498                         err = -EINVAL;
3499                         break;
3500
3501         }
3502         return err;
3503 }
3504
3505 /*
3506  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3507  */
3508 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3509 {
3510         int err;
3511         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3512
3513         if (!dev)
3514                 return -ENODEV;
3515
3516         switch (cmd) {
3517                 case SIOCSIFFLAGS:      /* Set interface flags */
3518                         return dev_change_flags(dev, ifr->ifr_flags);
3519
3520                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3521                                            (currently unused) */
3522                         return -EOPNOTSUPP;
3523
3524                 case SIOCSIFMTU:        /* Set the MTU of a device */
3525                         return dev_set_mtu(dev, ifr->ifr_mtu);
3526
3527                 case SIOCSIFHWADDR:
3528                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3529
3530                 case SIOCSIFHWBROADCAST:
3531                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3532                                 return -EINVAL;
3533                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3534                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3535                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3536                         return 0;
3537
3538                 case SIOCSIFMAP:
3539                         if (dev->set_config) {
3540                                 if (!netif_device_present(dev))
3541                                         return -ENODEV;
3542                                 return dev->set_config(dev, &ifr->ifr_map);
3543                         }
3544                         return -EOPNOTSUPP;
3545
3546                 case SIOCADDMULTI:
3547                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3548                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3549                                 return -EINVAL;
3550                         if (!netif_device_present(dev))
3551                                 return -ENODEV;
3552                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3553                                           dev->addr_len, 1);
3554
3555                 case SIOCDELMULTI:
3556                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3557                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3558                                 return -EINVAL;
3559                         if (!netif_device_present(dev))
3560                                 return -ENODEV;
3561                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3562                                              dev->addr_len, 1);
3563
3564                 case SIOCSIFTXQLEN:
3565                         if (ifr->ifr_qlen < 0)
3566                                 return -EINVAL;
3567                         dev->tx_queue_len = ifr->ifr_qlen;
3568                         return 0;
3569
3570                 case SIOCSIFNAME:
3571                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3572                         return dev_change_name(dev, ifr->ifr_newname);
3573
3574                 /*
3575                  *      Unknown or private ioctl
3576                  */
3577
3578                 default:
3579                         if ((cmd >= SIOCDEVPRIVATE &&
3580                             cmd <= SIOCDEVPRIVATE + 15) ||
3581                             cmd == SIOCBONDENSLAVE ||
3582                             cmd == SIOCBONDRELEASE ||
3583                             cmd == SIOCBONDSETHWADDR ||
3584                             cmd == SIOCBONDSLAVEINFOQUERY ||
3585                             cmd == SIOCBONDINFOQUERY ||
3586                             cmd == SIOCBONDCHANGEACTIVE ||
3587                             cmd == SIOCGMIIPHY ||
3588                             cmd == SIOCGMIIREG ||
3589                             cmd == SIOCSMIIREG ||
3590                             cmd == SIOCBRADDIF ||
3591                             cmd == SIOCBRDELIF ||
3592                             cmd == SIOCWANDEV) {
3593                                 err = -EOPNOTSUPP;
3594                                 if (dev->do_ioctl) {
3595                                         if (netif_device_present(dev))
3596                                                 err = dev->do_ioctl(dev, ifr,
3597                                                                     cmd);
3598                                         else
3599                                                 err = -ENODEV;
3600                                 }
3601                         } else
3602                                 err = -EINVAL;
3603
3604         }
3605         return err;
3606 }
3607
3608 /*
3609  *      This function handles all "interface"-type I/O control requests. The actual
3610  *      'doing' part of this is dev_ifsioc_locked() and dev_ifsioc() above.
3611  */
3612
3613 /**
3614  *      dev_ioctl       -       network device ioctl
3615  *      @net: the applicable net namespace
3616  *      @cmd: command to issue
3617  *      @arg: pointer to a struct ifreq in user space
3618  *
3619  *      Issue ioctl functions to devices. This is normally called by the
3620  *      user space syscall interfaces but can sometimes be useful for
3621  *      other purposes. The return value is the return from the syscall if
3622  *      positive or a negative errno code on error.
3623  */
3624
3625 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3626 {
3627         struct ifreq ifr;
3628         int ret;
3629         char *colon;
3630
3631         /* One special case: SIOCGIFCONF takes ifconf argument
3632            and requires shared lock, because it sleeps writing
3633            to user space.
3634          */
3635
3636         if (cmd == SIOCGIFCONF) {
3637                 rtnl_lock();
3638                 ret = dev_ifconf(net, (char __user *) arg);
3639                 rtnl_unlock();
3640                 return ret;
3641         }
3642         if (cmd == SIOCGIFNAME)
3643                 return dev_ifname(net, (struct ifreq __user *)arg);
3644
3645         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3646                 return -EFAULT;
3647
3648         ifr.ifr_name[IFNAMSIZ-1] = 0;
3649
3650         colon = strchr(ifr.ifr_name, ':');
3651         if (colon)
3652                 *colon = 0;
3653
3654         /*
3655          *      See which interface the caller is talking about.
3656          */
3657
3658         switch (cmd) {
3659                 /*
3660                  *      These ioctl calls:
3661                  *      - can be done by all.
3662                  *      - atomic and do not require locking.
3663                  *      - return a value
3664                  */
3665                 case SIOCGIFFLAGS:
3666                 case SIOCGIFMETRIC:
3667                 case SIOCGIFMTU:
3668                 case SIOCGIFHWADDR:
3669                 case SIOCGIFSLAVE:
3670                 case SIOCGIFMAP:
3671                 case SIOCGIFINDEX:
3672                 case SIOCGIFTXQLEN:
3673                         dev_load(net, ifr.ifr_name);
3674                         read_lock(&dev_base_lock);
3675                         ret = dev_ifsioc_locked(net, &ifr, cmd);
3676                         read_unlock(&dev_base_lock);
3677                         if (!ret) {
3678                                 if (colon)
3679                                         *colon = ':';
3680                                 if (copy_to_user(arg, &ifr,
3681                                                  sizeof(struct ifreq)))
3682                                         ret = -EFAULT;
3683                         }
3684                         return ret;
3685
3686                 case SIOCETHTOOL:
3687                         dev_load(net, ifr.ifr_name);
3688                         rtnl_lock();
3689                         ret = dev_ethtool(net, &ifr);
3690                         rtnl_unlock();
3691                         if (!ret) {
3692                                 if (colon)
3693                                         *colon = ':';
3694                                 if (copy_to_user(arg, &ifr,
3695                                                  sizeof(struct ifreq)))
3696                                         ret = -EFAULT;
3697                         }
3698                         return ret;
3699
3700                 /*
3701                  *      These ioctl calls:
3702                  *      - require superuser power.
3703                  *      - require strict serialization.
3704                  *      - return a value
3705                  */
3706                 case SIOCGMIIPHY:
3707                 case SIOCGMIIREG:
3708                 case SIOCSIFNAME:
3709                         if (!capable(CAP_NET_ADMIN))
3710                                 return -EPERM;
3711                         dev_load(net, ifr.ifr_name);
3712                         rtnl_lock();
3713                         ret = dev_ifsioc(net, &ifr, cmd);
3714                         rtnl_unlock();
3715                         if (!ret) {
3716                                 if (colon)
3717                                         *colon = ':';
3718                                 if (copy_to_user(arg, &ifr,
3719                                                  sizeof(struct ifreq)))
3720                                         ret = -EFAULT;
3721                         }
3722                         return ret;
3723
3724                 /*
3725                  *      These ioctl calls:
3726                  *      - require superuser power.
3727                  *      - require strict serialization.
3728                  *      - do not return a value
3729                  */
3730                 case SIOCSIFFLAGS:
3731                 case SIOCSIFMETRIC:
3732                 case SIOCSIFMTU:
3733                 case SIOCSIFMAP:
3734                 case SIOCSIFHWADDR:
3735                 case SIOCSIFSLAVE:
3736                 case SIOCADDMULTI:
3737                 case SIOCDELMULTI:
3738                 case SIOCSIFHWBROADCAST:
3739                 case SIOCSIFTXQLEN:
3740                 case SIOCSMIIREG:
3741                 case SIOCBONDENSLAVE:
3742                 case SIOCBONDRELEASE:
3743                 case SIOCBONDSETHWADDR:
3744                 case SIOCBONDCHANGEACTIVE:
3745                 case SIOCBRADDIF:
3746                 case SIOCBRDELIF:
3747                         if (!capable(CAP_NET_ADMIN))
3748                                 return -EPERM;
3749                         /* fall through */
3750                 case SIOCBONDSLAVEINFOQUERY:
3751                 case SIOCBONDINFOQUERY:
3752                         dev_load(net, ifr.ifr_name);
3753                         rtnl_lock();
3754                         ret = dev_ifsioc(net, &ifr, cmd);
3755                         rtnl_unlock();
3756                         return ret;
3757
3758                 case SIOCGIFMEM:
3759                         /* Get the per device memory space. We can add this but
3760                          * currently do not support it */
3761                 case SIOCSIFMEM:
3762                         /* Set the per device memory buffer space.
3763                          * Not applicable in our case */
3764                 case SIOCSIFLINK:
3765                         return -EINVAL;
3766
3767                 /*
3768                  *      Unknown or private ioctl.
3769                  */
3770                 default:
3771                         if (cmd == SIOCWANDEV ||
3772                             (cmd >= SIOCDEVPRIVATE &&
3773                              cmd <= SIOCDEVPRIVATE + 15)) {
3774                                 dev_load(net, ifr.ifr_name);
3775                                 rtnl_lock();
3776                                 ret = dev_ifsioc(net, &ifr, cmd);
3777                                 rtnl_unlock();
3778                                 if (!ret && copy_to_user(arg, &ifr,
3779                                                          sizeof(struct ifreq)))
3780                                         ret = -EFAULT;
3781                                 return ret;
3782                         }
3783                         /* Take care of Wireless Extensions */
3784                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3785                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
3786                         return -EINVAL;
3787         }
3788 }
3789
3790
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advances a function-static counter, resetting it to 1 whenever the
 *	increment leaves it non-positive, until it reaches a value that
 *	__dev_get_by_index() does not find in @net, and returns that value.
 *	The caller must hold the rtnl semaphore or the dev_base_lock to be
 *	sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
3809
3810 /* Delayed registration/unregistration: devices queued on net_todo_list are processed by netdev_run_todo() */
3811 static DEFINE_SPINLOCK(net_todo_list_lock);	/* held while net_todo_list is appended to or snapshotted */
3812 static LIST_HEAD(net_todo_list);
3813
/*
 * Append @dev (through its todo_list member) to the tail of net_todo_list,
 * holding net_todo_list_lock around the list update.  netdev_run_todo()
 * later takes the queued entries off that list.
 */
3814 static void net_set_todo(struct net_device *dev)
3815 {
3816         spin_lock(&net_todo_list_lock);
3817         list_add_tail(&dev->todo_list, &net_todo_list);
3818         spin_unlock(&net_todo_list_lock);
3819 }
3820
/*
 * Undo a completed registration of @dev.  In order: close the device,
 * unlink it from the device lists, mark it NETREG_UNREGISTERING, shut down
 * its queueing discipline, broadcast NETDEV_UNREGISTER, flush its address
 * lists, call the driver's ->uninit hook if set, remove its kobject and
 * drop one reference with dev_put().
 *
 * Within this part of the file it is called by register_netdevice() when a
 * NETDEV_REGISTER notifier reports an error.  On return reg_state is still
 * NETREG_UNREGISTERING; moving on from there is left to the caller.
 */
3821 static void rollback_registered(struct net_device *dev)
3822 {
3823         BUG_ON(dev_boot_phase);
3824         ASSERT_RTNL();
3825
3826         /* Some devices call without registering for initialization unwind:
            warn and leave such a device untouched. */
3827         if (dev->reg_state == NETREG_UNINITIALIZED) {
3828                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3829                                   "was registered\n", dev->name, dev);
3830
3831                 WARN_ON(1);
3832                 return;
3833         }
3834
             /* Any state other than fully registered is a fatal bug here. */
3835         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3836
3837         /* If device is running, close it first. */
3838         dev_close(dev);
3839
3840         /* And unlink it from device chain. */
3841         unlist_netdevice(dev);
3842
3843         dev->reg_state = NETREG_UNREGISTERING;
3844
             /* NOTE(review): presumably waits for readers that may still see
                the device after it was unlinked - confirm. */
3845         synchronize_net();
3846
3847         /* Shutdown queueing discipline. */
3848         dev_shutdown(dev);
3849
3850
3851         /* Notify protocols, that we are about to destroy
3852            this device. They should clean all the things.
3853         */
3854         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3855
3856         /*
3857          *      Flush the unicast and multicast chains
3858          */
3859         dev_addr_discard(dev);
3860
             /* Give the driver its teardown callback, if it provides one. */
3861         if (dev->uninit)
3862                 dev->uninit(dev);
3863
3864         /* Notifier chain MUST detach us from master device. */
3865         WARN_ON(dev->master);
3866
3867         /* Remove entries from kobject tree */
3868         netdev_unregister_kobject(dev);
3869
3870         synchronize_net();
3871
             /* NOTE(review): presumably balances the dev_hold() done in
                register_netdevice() - confirm. */
3872         dev_put(dev);
3873 }
3874
/*
 * Per-queue callback used by netdev_init_queue_locks(): initialise
 * @dev_queue's _xmit_lock, give it a lockdep class selected by @dev->type
 * and set xmit_lock_owner to -1.
 * NOTE(review): -1 presumably means "no CPU owns the lock" - confirm.
 * @_unused is ignored; it is there to fit the netdev_for_each_tx_queue()
 * callback signature.
 */
3875 static void __netdev_init_queue_locks_one(struct net_device *dev,
3876                                           struct netdev_queue *dev_queue,
3877                                           void *_unused)
3878 {
3879         spin_lock_init(&dev_queue->_xmit_lock);
3880         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
3881         dev_queue->xmit_lock_owner = -1;
3882 }
3883
/*
 * Run __netdev_init_queue_locks_one() on every TX queue of @dev and then on
 * its rx_queue.  register_netdevice() calls this before the driver's ->init.
 */
3884 static void netdev_init_queue_locks(struct net_device *dev)
3885 {
3886         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
3887         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
3888 }
3889
3890 /**
3891  *      register_netdevice      - register a network device
3892  *      @dev: device to register
3893  *
3894  *      Take a completed network device structure and add it to the kernel
3895  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3896  *      chain. 0 is returned on success. A negative errno code is returned
3897  *      on a failure to set up the device, or if the name is a duplicate.
3898  *
3899  *      Callers must hold the rtnl semaphore. You may want
3900  *      register_netdev() instead of this.
3901  *
3902  *      BUGS:
3903  *      The locking appears insufficient to guarantee two parallel registers
3904  *      will not get the same name.
3905  */
3906
3907 int register_netdevice(struct net_device *dev)
3908 {
3909         struct hlist_head *head;
3910         struct hlist_node *p;
3911         int ret;
3912         struct net *net;
3913
3914         BUG_ON(dev_boot_phase);
3915         ASSERT_RTNL();
3916
3917         might_sleep();
3918
3919         /* When net_device's are persistent, this will be fatal. */
3920         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3921         BUG_ON(!dev_net(dev));
3922         net = dev_net(dev);
3923
3924         spin_lock_init(&dev->addr_list_lock);
3925         netdev_set_addr_lockdep_class(dev);
3926         netdev_init_queue_locks(dev);
3927
3928         dev->iflink = -1;
3929
3930         /* Init, if this function is available */
3931         if (dev->init) {
3932                 ret = dev->init(dev);
3933                 if (ret) {
3934                         if (ret > 0)
3935                                 ret = -EIO;
3936                         goto out;
3937                 }
3938         }
3939
3940         if (!dev_valid_name(dev->name)) {
3941                 ret = -EINVAL;
3942                 goto err_uninit;
3943         }
3944
3945         dev->ifindex = dev_new_index(net);
3946         if (dev->iflink == -1)
3947                 dev->iflink = dev->ifindex;
3948
3949         /* Check for existence of name */
3950         head = dev_name_hash(net, dev->name);
3951         hlist_for_each(p, head) {
3952                 struct net_device *d
3953                         = hlist_entry(p, struct net_device, name_hlist);
3954                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3955                         ret = -EEXIST;
3956                         goto err_uninit;
3957                 }
3958         }
3959
3960         /* Fix illegal checksum combinations */
3961         if ((dev->features & NETIF_F_HW_CSUM) &&
3962             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3963                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3964                        dev->name);
3965                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3966         }
3967
3968         if ((dev->features & NETIF_F_NO_CSUM) &&
3969             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3970                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3971                        dev->name);
3972                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3973         }
3974
3975
3976         /* Fix illegal SG+CSUM combinations. */
3977         if ((dev->features & NETIF_F_SG) &&
3978             !(dev->features & NETIF_F_ALL_CSUM)) {
3979                 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3980                        dev->name);
3981                 dev->features &= ~NETIF_F_SG;
3982         }
3983
3984         /* TSO requires that SG is present as well. */
3985         if ((dev->features & NETIF_F_TSO) &&
3986             !(dev->features & NETIF_F_SG)) {
3987                 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3988                        dev->name);
3989                 dev->features &= ~NETIF_F_TSO;
3990         }
3991         if (dev->features & NETIF_F_UFO) {
3992                 if (!(dev->features & NETIF_F_HW_CSUM)) {
3993                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3994                                         "NETIF_F_HW_CSUM feature.\n",
3995                                                         dev->name);
3996                         dev->features &= ~NETIF_F_UFO;
3997                 }
3998                 if (!(dev->features & NETIF_F_SG)) {
3999                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
4000                                         "NETIF_F_SG feature.\n",
4001                                         dev->name);
4002                         dev->features &= ~NETIF_F_UFO;
4003                 }
4004         }
4005
4006         /* Enable software GSO if SG is supported. */
4007         if (dev->features & NETIF_F_SG)
4008                 dev->features |= NETIF_F_GSO;
4009
4010         netdev_initialize_kobject(dev);
4011         ret = netdev_register_kobject(dev);
4012         if (ret)
4013                 goto err_uninit;
4014         dev->reg_state = NETREG_REGISTERED;
4015
4016         /*
4017          *      Default initial state at registry is that the
4018          *      device is present.
4019          */
4020
4021         set_bit(__LINK_STATE_PRESENT, &dev->state);
4022
4023         dev_init_scheduler(dev);
4024         dev_hold(dev);
4025         list_netdevice(dev);
4026
4027         /* Notify protocols, that a new device appeared. */
4028         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4029         ret = notifier_to_errno(ret);
4030         if (ret) {
4031                 rollback_registered(dev);
4032                 dev->reg_state = NETREG_UNREGISTERED;
4033         }
4034
4035 out:
4036         return ret;
4037
4038 err_uninit:
4039         if (dev->uninit)
4040                 dev->uninit(dev);
4041         goto out;
4042 }
4043
4044 /**
4045  *      register_netdev - register a network device
4046  *      @dev: device to register
4047  *
4048  *      Take a completed network device structure and add it to the kernel
4049  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4050  *      chain. 0 is returned on success. A negative errno code is returned
4051  *      on a failure to set up the device, or if the name is a duplicate.
4052  *
4053  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4054  *      and expands the device name if you passed a format string to
4055  *      alloc_netdev.
4056  */
4057 int register_netdev(struct net_device *dev)
4058 {
4059         int err;
4060
4061         rtnl_lock();
4062
4063         /*
4064          * If the name is a format string the caller wants us to do a
4065          * name allocation.
4066          */
4067         if (strchr(dev->name, '%')) {
4068                 err = dev_alloc_name(dev, dev->name);
4069                 if (err < 0)
4070                         goto out;
4071         }
4072
4073         err = register_netdevice(dev);
4074 out:
4075         rtnl_unlock();
4076         return err;
4077 }
4078 EXPORT_SYMBOL(register_netdev);
4079
4080 /*
4081  * netdev_wait_allrefs - wait until all references are gone.
4082  *
4083  * This is called when unregistering network devices.
4084  *
4085  * Any protocol or device that holds a reference should register
4086  * for netdevice notification, and cleanup and put back the
4087  * reference if they receive an UNREGISTER event.
4088  * We can get stuck here if buggy protocols don't correctly
4089  * call dev_put.
4090  */
4091 static void netdev_wait_allrefs(struct net_device *dev)
4092 {
4093         unsigned long rebroadcast_time, warning_time;
4094
4095         rebroadcast_time = warning_time = jiffies;
4096         while (atomic_read(&dev->refcnt) != 0) {
4097                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4098                         rtnl_lock();
4099
4100                         /* Rebroadcast unregister notification */
4101                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4102
4103                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4104                                      &dev->state)) {
4105                                 /* We must not have linkwatch events
4106                                  * pending on unregister. If this
4107                                  * happens, we simply run the queue
4108                                  * unscheduled, resulting in a noop
4109                                  * for this device.
4110                                  */
4111                                 linkwatch_run_queue();
4112                         }
4113
4114                         __rtnl_unlock();
4115
4116                         rebroadcast_time = jiffies;
4117                 }
4118
4119                 msleep(250);
4120
4121                 if (time_after(jiffies, warning_time + 10 * HZ)) {
4122                         printk(KERN_EMERG "unregister_netdevice: "
4123                                "waiting for %s to become free. Usage "
4124                                "count = %d\n",
4125                                dev->name, atomic_read(&dev->refcnt));
4126                         warning_time = jiffies;
4127                 }
4128         }
4129 }
4130
4131 /* The sequence is:
4132  *
4133  *      rtnl_lock();
4134  *      ...
4135  *      register_netdevice(x1);
4136  *      register_netdevice(x2);
4137  *      ...
4138  *      unregister_netdevice(y1);
4139  *      unregister_netdevice(y2);
4140  *      ...
4141  *      rtnl_unlock();
4142  *      free_netdev(y1);
4143  *      free_netdev(y2);
4144  *
4145  * We are invoked by rtnl_unlock() after it drops the semaphore.
4146  * This allows us to deal with problems:
4147  * 1) We can delete sysfs objects which invoke hotplug
4148  *    without deadlocking with linkwatch via keventd.
4149  * 2) Since we run with the RTNL semaphore not held, we can sleep
4150  *    safely in order to wait for the netdev refcnt to drop to zero.
4151  */
4152 static DEFINE_MUTEX(net_todo_run_mutex);	/* serialises whole runs of netdev_run_todo() */
4153 void netdev_run_todo(void)
4154 {
             /* Private list head that receives the snapshot of net_todo_list. */
4155         struct list_head list;
4156
4157         /* Need to guard against multiple cpu's getting out of order. */
4158         mutex_lock(&net_todo_run_mutex);
4159
4160         /* Not safe to do outside the semaphore.  We must not return
4161          * until all unregister events invoked by the local processor
4162          * have been completed (either by this todo run, or one on
4163          * another cpu).
4164          */
4165         if (list_empty(&net_todo_list))
4166                 goto out;
4167
4168         /* Snapshot list, allow later requests: the entries move to the
            local list and net_todo_list is left empty. */
4169         spin_lock(&net_todo_list_lock);
4170         list_replace_init(&net_todo_list, &list);
4171         spin_unlock(&net_todo_list_lock);
4172
4173         while (!list_empty(&list)) {
4174                 struct net_device *dev
4175                         = list_entry(list.next, struct net_device, todo_list);
4176                 list_del(&dev->todo_list);
4177
                     /* Only devices in the UNREGISTERING state are expected
                        here; report anything else and skip it. */
4178                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4179                         printk(KERN_ERR "network todo '%s' but state %d\n",
4180                                dev->name, dev->reg_state);
4181                         dump_stack();
4182                         continue;
4183                 }
4184
4185                 dev->reg_state = NETREG_UNREGISTERED;
4186
                     /* Run flush_backlog() for this device on every CPU. */
4187                 on_each_cpu(flush_backlog, dev, 1);
4188
                     /* Sleeps until dev->refcnt reads zero. */
4189                 netdev_wait_allrefs(dev);
4190
4191                 /* paranoia */
4192                 BUG_ON(atomic_read(&dev->refcnt));
4193                 WARN_ON(dev->ip_ptr);
4194                 WARN_ON(dev->ip6_ptr);
4195                 WARN_ON(dev->dn_ptr);
4196
                     /* Optional driver hook, called once no references remain. */
4197                 if (dev->destructor)
4198                         dev->destructor(dev);
4199
4200                 /* Free network device */
4201                 kobject_put(&dev->dev.kobj);
4202         }
4203
4204 out:
4205         mutex_unlock(&net_todo_run_mutex);
4206 }
4207
/*
 * Return the statistics structure embedded in @dev (&dev->stats).
 * alloc_netdev_mq() installs this function as the initial dev->get_stats.
 */
4208 static struct net_device_stats *internal_stats(struct net_device *dev)
4209 {
4210         return &dev->stats;
4211 }
4212
/*
 * Per-queue callback used by netdev_init_queues(): point @queue back at the
 * device that owns it.  @_unused is ignored.
 */
4213 static void netdev_init_one_queue(struct net_device *dev,
4214                                   struct netdev_queue *queue,
4215                                   void *_unused)
4216 {
4217         queue->dev = dev;
4218 }
4219
/*
 * Set the owning-device pointer on @dev's rx_queue and on each of its TX
 * queues, then initialise dev->tx_global_lock.  Called from
 * alloc_netdev_mq() after dev->_tx and dev->num_tx_queues have been set.
 */
4220 static void netdev_init_queues(struct net_device *dev)
4221 {
4222         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4223         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4224         spin_lock_init(&dev->tx_global_lock);
4225 }
4226
4227 /**
4228  *      alloc_netdev_mq - allocate network device
4229  *      @sizeof_priv:   size of private data to allocate space for
4230  *      @name:          device name format string
4231  *      @setup:         callback to initialize device
4232  *      @queue_count:   the number of subqueues to allocate
4233  *
4234  *      Allocates a struct net_device with private data area for driver use
4235  *      and performs basic initialization.  Also allocates subquue structs
4236  *      for each queue on the device at the end of the netdevice.
4237  */
4238 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4239                 void (*setup)(struct net_device *), unsigned int queue_count)
4240 {
4241         struct netdev_queue *tx;
4242         struct net_device *dev;
4243         size_t alloc_size;
4244         void *p;
4245
4246         BUG_ON(strlen(name) >= sizeof(dev->name));
4247
4248         alloc_size = sizeof(struct net_device);
4249         if (sizeof_priv) {
4250                 /* ensure 32-byte alignment of private area */
4251                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4252                 alloc_size += sizeof_priv;
4253         }
4254         /* ensure 32-byte alignment of whole construct */
4255         alloc_size += NETDEV_ALIGN_CONST;
4256
4257         p = kzalloc(alloc_size, GFP_KERNEL);
4258         if (!p) {
4259                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4260                 return NULL;
4261         }
4262
4263         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4264         if (!tx) {
4265                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4266                        "tx qdiscs.\n");
4267                 kfree(p);
4268                 return NULL;
4269         }
4270
4271         dev = (struct net_device *)
4272                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4273         dev->padded = (char *)dev - (char *)p;
4274         dev_net_set(dev, &init_net);
4275
4276         dev->_tx = tx;
4277         dev->num_tx_queues = queue_count;
4278         dev->real_num_tx_queues = queue_count;
4279
4280         if (sizeof_priv) {
4281                 dev->priv = ((char *)dev +
4282                              ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
4283                               & ~NETDEV_ALIGN_CONST));
4284         }
4285
4286         dev->gso_max_size = GSO_MAX_SIZE;
4287
4288         netdev_init_queues(dev);
4289
4290         dev->get_stats = internal_stats;
4291         netpoll_netdev_init(dev);
4292         setup(dev);
4293         strcpy(dev->name, name);
4294         return dev;
4295 }
4296 EXPORT_SYMBOL(alloc_netdev_mq);
4297
4298 /**
4299  *      free_netdev - free network device
4300  *      @dev: device
4301  *
4302  *      This function does the last stage of destroying an allocated device
4303  *      interface. The reference to the device object is released.
4304  *      If this is the last reference then it will be freed.
4305  */
4306 void free_netdev(struct net_device *dev)
4307 {
4308         release_net(dev_net(dev));
4309
4310         kfree(dev->_tx);
4311
4312         /*  Compatibility with error handling in drivers */
4313         if (dev->reg_state == NETREG_UNINITIALIZED) {
4314                 kfree((char *)dev - dev->padded);
4315                 return;
4316         }
4317
4318         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4319         dev->reg_state = NETREG_RELEASED;
4320
4321         /* will free via device release */
4322         put_device(&dev->dev);
4323 }
4324
/*
 * Synchronize with packet receive processing.
 *
 * Annotated with might_sleep(); returns once synchronize_rcu() has
 * completed.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4331
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.  The registration is rolled back immediately; the rest of
 *	the unregister processing is deferred until after the rtnl
 *	semaphore has been dropped.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4351
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Wrapper around unregister_netdevice() that takes and releases the
 *	rtnl semaphore itself.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
4371
4372 /**
4373  *      dev_change_net_namespace - move device to different nethost namespace
4374  *      @dev: device
4375  *      @net: network namespace
4376  *      @pat: If not NULL name pattern to try if the current device name
4377  *            is already taken in the destination network namespace.
4378  *
4379  *      This function shuts down a device interface and moves it
4380  *      to a new network namespace. On success 0 is returned, on
4381  *      a failure a netagive errno code is returned.
4382  *
4383  *      Callers must hold the rtnl semaphore.
4384  */
4385
4386 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4387 {
4388         char buf[IFNAMSIZ];
4389         const char *destname;
4390         int err;
4391
4392         ASSERT_RTNL();
4393
4394         /* Don't allow namespace local devices to be moved. */
4395         err = -EINVAL;
4396         if (dev->features & NETIF_F_NETNS_LOCAL)
4397                 goto out;
4398
4399         /* Ensure the device has been registrered */
4400         err = -EINVAL;
4401         if (dev->reg_state != NETREG_REGISTERED)
4402                 goto out;
4403
4404         /* Get out if there is nothing todo */
4405         err = 0;
4406         if (net_eq(dev_net(dev), net))
4407                 goto out;
4408
4409         /* Pick the destination device name, and ensure
4410          * we can use it in the destination network namespace.
4411          */
4412         err = -EEXIST;
4413         destname = dev->name;
4414         if (__dev_get_by_name(net, destname)) {
4415                 /* We get here if we can't use the current device name */
4416                 if (!pat)
4417                         goto out;
4418                 if (!dev_valid_name(pat))
4419                         goto out;
4420                 if (strchr(pat, '%')) {
4421                         if (__dev_alloc_name(net, pat, buf) < 0)
4422                                 goto out;
4423                         destname = buf;
4424                 } else
4425                         destname = pat;
4426                 if (__dev_get_by_name(net, destname))
4427                         goto out;
4428         }
4429
4430         /*
4431          * And now a mini version of register_netdevice unregister_netdevice.
4432          */
4433
4434         /* If device is running close it first. */
4435         dev_close(dev);
4436
4437         /* And unlink it from device chain */
4438         err = -ENODEV;
4439         unlist_netdevice(dev);
4440
4441         synchronize_net();
4442
4443         /* Shutdown queueing discipline. */
4444         dev_shutdown(dev);
4445
4446         /* Notify protocols, that we are about to destroy
4447            this device. They should clean all the things.
4448         */
4449         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4450
4451         /*
4452          *      Flush the unicast and multicast chains
4453          */
4454         dev_addr_discard(dev);
4455
4456         /* Actually switch the network namespace */
4457         dev_net_set(dev, net);
4458
4459         /* Assign the new device name */
4460         if (destname != dev->name)
4461                 strcpy(dev->name, destname);
4462
4463         /* If there is an ifindex conflict assign a new one */
4464         if (__dev_get_by_index(net, dev->ifindex)) {
4465                 int iflink = (dev->iflink == dev->ifindex);
4466                 dev->ifindex = dev_new_index(net);
4467                 if (iflink)
4468                         dev->iflink = dev->ifindex;
4469         }
4470
4471         /* Fixup kobjects */
4472         netdev_unregister_kobject(dev);
4473         err = netdev_register_kobject(dev);
4474         WARN_ON(err);
4475
4476         /* Add the device back in the hashes */
4477         list_netdevice(dev);
4478
4479         /* Notify protocols, that a new device appeared. */
4480         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4481
4482         synchronize_net();
4483         err = 0;
4484 out:
4485         return err;
4486 }
4487
4488 static int dev_cpu_callback(struct notifier_block *nfb,
4489                             unsigned long action,
4490                             void *ocpu)
4491 {
4492         struct sk_buff **list_skb;
4493         struct Qdisc **list_net;
4494         struct sk_buff *skb;
4495         unsigned int cpu, oldcpu = (unsigned long)ocpu;
4496         struct softnet_data *sd, *oldsd;
4497
4498         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4499                 return NOTIFY_OK;
4500
4501         local_irq_disable();
4502         cpu = smp_processor_id();
4503         sd = &per_cpu(softnet_data, cpu);
4504         oldsd = &per_cpu(softnet_data, oldcpu);
4505
4506         /* Find end of our completion_queue. */
4507         list_skb = &sd->completion_queue;
4508         while (*list_skb)
4509                 list_skb = &(*list_skb)->next;
4510         /* Append completion queue from offline CPU. */
4511         *list_skb = oldsd->completion_queue;
4512         oldsd->completion_queue = NULL;
4513
4514         /* Find end of our output_queue. */
4515         list_net = &sd->output_queue;
4516         while (*list_net)
4517                 list_net = &(*list_net)->next_sched;
4518         /* Append output queue from offline CPU. */
4519         *list_net = oldsd->output_queue;
4520         oldsd->output_queue = NULL;
4521
4522         raise_softirq_irqoff(NET_TX_SOFTIRQ);
4523         local_irq_enable();
4524
4525         /* Process offline CPU's input_pkt_queue */
4526         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4527                 netif_rx(skb);
4528
4529         return NOTIFY_OK;
4530 }
4531
4532 #ifdef CONFIG_NET_DMA
4533 /**
4534  * net_dma_rebalance - try to maintain one DMA channel per CPU
4535  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4536  *
4537  * This is called when the number of channels allocated to the net_dma client
4538  * changes.  The net_dma client tries to have one DMA channel per CPU.
4539  */
4540
4541 static void net_dma_rebalance(struct net_dma *net_dma)
4542 {
4543         unsigned int cpu, i, n, chan_idx;
4544         struct dma_chan *chan;
4545
4546         if (cpus_empty(net_dma->channel_mask)) {
4547                 for_each_online_cpu(cpu)
4548                         rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4549                 return;
4550         }
4551
4552         i = 0;
4553         cpu = first_cpu(cpu_online_map);
4554
4555         for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
4556                 chan = net_dma->channels[chan_idx];
4557
4558                 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4559                    + (i < (num_online_cpus() %
4560                         cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4561
4562                 while(n) {
4563                         per_cpu(softnet_data, cpu).net_dma = chan;
4564                         cpu = next_cpu(cpu, cpu_online_map);
4565                         n--;
4566                 }
4567                 i++;
4568         }
4569 }
4570
4571 /**
4572  * netdev_dma_event - event callback for the net_dma_client
4573  * @client: should always be net_dma_client
4574  * @chan: DMA channel for the event
4575  * @state: DMA state to be handled
4576  */
4577 static enum dma_state_client
4578 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4579         enum dma_state state)
4580 {
4581         int i, found = 0, pos = -1;
4582         struct net_dma *net_dma =
4583                 container_of(client, struct net_dma, client);
4584         enum dma_state_client ack = DMA_DUP; /* default: take no action */
4585
4586         spin_lock(&net_dma->lock);
4587         switch (state) {
4588         case DMA_RESOURCE_AVAILABLE:
4589                 for (i = 0; i < nr_cpu_ids; i++)
4590                         if (net_dma->channels[i] == chan) {
4591                                 found = 1;
4592                                 break;
4593                         } else if (net_dma->channels[i] == NULL && pos < 0)
4594                                 pos = i;
4595
4596                 if (!found && pos >= 0) {
4597                         ack = DMA_ACK;
4598                         net_dma->channels[pos] = chan;
4599                         cpu_set(pos, net_dma->channel_mask);
4600                         net_dma_rebalance(net_dma);
4601                 }
4602                 break;
4603         case DMA_RESOURCE_REMOVED:
4604                 for (i = 0; i < nr_cpu_ids; i++)
4605                         if (net_dma->channels[i] == chan) {
4606                                 found = 1;
4607                                 pos = i;
4608                                 break;
4609                         }
4610
4611                 if (found) {
4612                         ack = DMA_ACK;
4613                         cpu_clear(pos, net_dma->channel_mask);
4614                         net_dma->channels[i] = NULL;
4615                         net_dma_rebalance(net_dma);
4616                 }
4617                 break;
4618         default:
4619                 break;
4620         }
4621         spin_unlock(&net_dma->lock);
4622
4623         return ack;
4624 }
4625
4626 /**
4627  * netdev_dma_regiser - register the networking subsystem as a DMA client
4628  */
4629 static int __init netdev_dma_register(void)
4630 {
4631         net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4632                                                                 GFP_KERNEL);
4633         if (unlikely(!net_dma.channels)) {
4634                 printk(KERN_NOTICE
4635                                 "netdev_dma: no memory for net_dma.channels\n");
4636                 return -ENOMEM;
4637         }
4638         spin_lock_init(&net_dma.lock);
4639         dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4640         dma_async_client_register(&net_dma.client);
4641         dma_async_client_chan_request(&net_dma.client);
4642         return 0;
4643 }
4644
4645 #else
4646 static int __init netdev_dma_register(void) { return -ENODEV; }
4647 #endif /* CONFIG_NET_DMA */
4648
4649 /**
4650  *      netdev_compute_feature - compute conjunction of two feature sets
4651  *      @all: first feature set
4652  *      @one: second feature set
4653  *
4654  *      Computes a new feature set after adding a device with feature set
4655  *      @one to the master device with current feature set @all.  Returns
4656  *      the new feature set.
4657  */
4658 int netdev_compute_features(unsigned long all, unsigned long one)
4659 {
4660         /* if device needs checksumming, downgrade to hw checksumming */
4661         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4662                 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4663
4664         /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4665         if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4666                 all ^= NETIF_F_HW_CSUM
4667                         | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4668
4669         if (one & NETIF_F_GSO)
4670                 one |= NETIF_F_GSO_SOFTWARE;
4671         one |= NETIF_F_GSO;
4672
4673         /* If even one device supports robust GSO, enable it for all. */
4674         if (one & NETIF_F_GSO_ROBUST)
4675                 all |= NETIF_F_GSO_ROBUST;
4676
4677         all &= one | NETIF_F_LLTX;
4678
4679         if (!(all & NETIF_F_ALL_CSUM))
4680                 all &= ~NETIF_F_SG;
4681         if (!(all & NETIF_F_SG))
4682                 all &= ~NETIF_F_GSO_MASK;
4683
4684         return all;
4685 }
4686 EXPORT_SYMBOL(netdev_compute_features);
4687
4688 static struct hlist_head *netdev_create_hash(void)
4689 {
4690         int i;
4691         struct hlist_head *hash;
4692
4693         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4694         if (hash != NULL)
4695                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4696                         INIT_HLIST_HEAD(&hash[i]);
4697
4698         return hash;
4699 }
4700
4701 /* Initialize per network namespace state */
4702 static int __net_init netdev_init(struct net *net)
4703 {
4704         INIT_LIST_HEAD(&net->dev_base_head);
4705
4706         net->dev_name_head = netdev_create_hash();
4707         if (net->dev_name_head == NULL)
4708                 goto err_name;
4709
4710         net->dev_index_head = netdev_create_hash();
4711         if (net->dev_index_head == NULL)
4712                 goto err_idx;
4713
4714         return 0;
4715
4716 err_idx:
4717         kfree(net->dev_name_head);
4718 err_name:
4719         return -ENOMEM;
4720 }
4721
4722 char *netdev_drivername(struct net_device *dev, char *buffer, int len)
4723 {
4724         struct device_driver *driver;
4725         struct device *parent;
4726
4727         if (len <= 0 || !buffer)
4728                 return buffer;
4729         buffer[0] = 0;
4730
4731         parent = dev->dev.parent;
4732
4733         if (!parent)
4734                 return buffer;
4735
4736         driver = parent->driver;
4737         if (driver && driver->name)
4738                 strlcpy(buffer, driver->name, len);
4739         return buffer;
4740 }
4741
4742 static void __net_exit netdev_exit(struct net *net)
4743 {
4744         kfree(net->dev_name_head);
4745         kfree(net->dev_index_head);
4746 }
4747
/*
 * Per network namespace operations for the device tables: netdev_init()
 * sets them up and netdev_exit() frees them.  Passed to
 * register_pernet_subsys() from net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
4752
4753 static void __net_exit default_device_exit(struct net *net)
4754 {
4755         struct net_device *dev, *next;
4756         /*
4757          * Push all migratable of the network devices back to the
4758          * initial network namespace
4759          */
4760         rtnl_lock();
4761         for_each_netdev_safe(net, dev, next) {
4762                 int err;
4763                 char fb_name[IFNAMSIZ];
4764
4765                 /* Ignore unmoveable devices (i.e. loopback) */
4766                 if (dev->features & NETIF_F_NETNS_LOCAL)
4767                         continue;
4768
4769                 /* Push remaing network devices to init_net */
4770                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4771                 err = dev_change_net_namespace(dev, &init_net, fb_name);
4772                 if (err) {
4773                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4774                                 __func__, dev->name, err);
4775                         BUG();
4776                 }
4777         }
4778         rtnl_unlock();
4779 }
4780
/*
 * Per network namespace device operations: only an exit handler, which
 * moves the remaining devices back to init_net.  Passed to
 * register_pernet_device() from net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
4784
4785 /*
4786  *      Initialize the DEV module. At boot time this walks the device list and
4787  *      unhooks any devices that fail to initialise (normally hardware not
4788  *      present) and leaves us with a valid list of present and active devices.
4789  *
4790  */
4791
4792 /*
4793  *       This is called single threaded during boot, so no need
4794  *       to take the rtnl semaphore.
4795  */
4796 static int __init net_dev_init(void)
4797 {
4798         int i, rc = -ENOMEM;
4799
4800         BUG_ON(!dev_boot_phase);
4801
4802         if (dev_proc_init())
4803                 goto out;
4804
4805         if (netdev_kobject_init())
4806                 goto out;
4807
4808         INIT_LIST_HEAD(&ptype_all);
4809         for (i = 0; i < PTYPE_HASH_SIZE; i++)
4810                 INIT_LIST_HEAD(&ptype_base[i]);
4811
4812         if (register_pernet_subsys(&netdev_net_ops))
4813                 goto out;
4814
4815         if (register_pernet_device(&default_device_ops))
4816                 goto out;
4817
4818         /*
4819          *      Initialise the packet receive queues.
4820          */
4821
4822         for_each_possible_cpu(i) {
4823                 struct softnet_data *queue;
4824
4825                 queue = &per_cpu(softnet_data, i);
4826                 skb_queue_head_init(&queue->input_pkt_queue);
4827                 queue->completion_queue = NULL;
4828                 INIT_LIST_HEAD(&queue->poll_list);
4829
4830                 queue->backlog.poll = process_backlog;
4831                 queue->backlog.weight = weight_p;
4832         }
4833
4834         netdev_dma_register();
4835
4836         dev_boot_phase = 0;
4837
4838         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
4839         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
4840
4841         hotcpu_notifier(dev_cpu_callback, 0);
4842         dst_init();
4843         dev_mcast_init();
4844         rc = 0;
4845 out:
4846         return rc;
4847 }
4848
4849 subsys_initcall(net_dev_init);
4850
/* Symbols from this file exported to the rest of the kernel and modules. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks are exported only when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* The per-CPU softnet_data is exported as a per-CPU symbol. */
EXPORT_PER_CPU_SYMBOL(softnet_data);