Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
diff --git a/net/core/dev.c b/net/core/dev.c
index 8e726cb..6392ea0 100644
 #include <trace/events/skb.h>
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
 
 #include "net-sysfs.h"
 
@@ -1286,7 +1287,7 @@ static int __dev_close(struct net_device *dev)
        return __dev_close_many(&single);
 }
 
-int dev_close_many(struct list_head *head)
+static int dev_close_many(struct list_head *head)
 {
        struct net_device *dev, *tmp;
        LIST_HEAD(tmp_list);
@@ -1594,6 +1595,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
        rcu_read_unlock();
 }
 
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this, verify that each tc mapping remains valid
+ * and, if not, reset the offending priority mapping to TC0. Once no
+ * priorities map to a given offset/count pair, that pair is simply
+ * no longer used. In the worst case, when TC0 itself is invalid,
+ * nothing can be done, so priority mappings are disabled entirely.
+ * It is expected that drivers will fix the mapping, if they can,
+ * before calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+       int i;
+       struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+       /* If TC0 is invalidated disable TC mapping */
+       if (tc->offset + tc->count > txq) {
+               pr_warning("Number of in use tx queues changed, "
+                          "invalidating tc mappings. Priority "
+                          "traffic classification disabled!\n");
+               dev->num_tc = 0;
+               return;
+       }
+
+       /* Invalidated prio to tc mappings set to TC0 */
+       for (i = 1; i < TC_BITMASK + 1; i++) {
+               int q = netdev_get_prio_tc_map(dev, i);
+
+               tc = &dev->tc_to_txq[q];
+               if (tc->offset + tc->count > txq) {
+                       pr_warning("Number of in use tx queues "
+                                  "changed. Priority %i to tc "
+                                  "mapping %i is no longer valid; "
+                                  "setting map to 0\n",
+                                  i, q);
+                       netdev_set_prio_tc_map(dev, i, 0);
+               }
+       }
+}
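
For context, the sketch below (not part of the patch) shows how a multiqueue driver might program the prio-to-tc and tc-to-txq tables up front so that netif_setup_tc() never has to repair anything. It assumes the netdev_set_num_tc()/netdev_set_tc_queue() helpers added alongside this code; the two-class layout and queue counts are made-up illustration, not taken from any real driver.

/* Hypothetical driver setup: 2 traffic classes over 8 tx queues.
 * TC0 -> queues 0-3, TC1 -> queues 4-7; priorities 0-3 map to TC0,
 * priorities 4-7 map to TC1.
 */
static int example_driver_setup_tc(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* tc 0: count 4, offset 0 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* tc 1: count 4, offset 4 */

	for (prio = 0; prio < 8; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);

	/* If the driver later shrinks real_num_tx_queues (say to 4), it
	 * should rewrite tc_to_txq first; otherwise netif_setup_tc()
	 * resets stale prio mappings to TC0, or disables tc entirely
	 * when TC0 itself no longer fits.
	 */
	return netif_set_real_num_tx_queues(dev, 8);
}
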
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
@@ -1613,6 +1656,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
                if (rc)
                        return rc;
 
+               if (dev->num_tc)
+                       netif_setup_tc(dev, txq);
+
                if (txq < dev->real_num_tx_queues)
                        qdisc_reset_all_tx_gt(dev, txq);
        }
@@ -1812,7 +1858,7 @@ EXPORT_SYMBOL(skb_checksum_help);
  *     It may return NULL if the skb requires no segmentation.  This is
  *     only possible when GSO is used for verifying header integrity.
  */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
 {
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
@@ -2000,7 +2046,7 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
                 protocol == htons(ETH_P_FCOE)));
 }
 
-static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
 {
        if (!can_checksum_protocol(features, protocol)) {
                features &= ~NETIF_F_ALL_CSUM;
@@ -2012,10 +2058,10 @@ static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features
        return features;
 }
 
-int netif_skb_features(struct sk_buff *skb)
+u32 netif_skb_features(struct sk_buff *skb)
 {
        __be16 protocol = skb->protocol;
-       int features = skb->dev->features;
+       u32 features = skb->dev->features;
 
        if (protocol == htons(ETH_P_8021Q)) {
                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2060,7 +2106,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
        int rc = NETDEV_TX_OK;
 
        if (likely(!skb->next)) {
-               int features;
+               u32 features;
 
                /*
                 * If device doesn't need skb->dst, release it right now while
@@ -2162,6 +2208,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
                  unsigned int num_tx_queues)
 {
        u32 hash;
+       u16 qoffset = 0;
+       u16 qcount = num_tx_queues;
 
        if (skb_rx_queue_recorded(skb)) {
                hash = skb_get_rx_queue(skb);
@@ -2170,13 +2218,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
                return hash;
        }
 
+       if (dev->num_tc) {
+               u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+               qoffset = dev->tc_to_txq[tc].offset;
+               qcount = dev->tc_to_txq[tc].count;
+       }
+
        if (skb->sk && skb->sk->sk_hash)
                hash = skb->sk->sk_hash;
        else
                hash = (__force u16) skb->protocol ^ skb->rxhash;
        hash = jhash_1word(hash, hashrnd);
 
-       return (u16) (((u64) hash * num_tx_queues) >> 32);
+       return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 }
 EXPORT_SYMBOL(__skb_tx_hash);
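
The return expression above maps the 32-bit hash onto the queue range owned by the skb's traffic class: ((u64)hash * qcount) >> 32 yields a value in [0, qcount), to which the class's first queue index qoffset is added. A worked example with illustrative numbers (not taken from the patch):

/* A traffic class owning queues 8-11, i.e. qoffset = 8, qcount = 4.
 * For hash = 0x80000000 (2^31):
 *
 *	((u64)0x80000000 * 4) >> 32  =  2
 *	2 + qoffset                  =  queue 10
 *
 * Every possible hash value lands in [8, 12), i.e. within the class's
 * own queues, rather than anywhere in [0, num_tx_queues) as before.
 */
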
 
@@ -2273,15 +2327,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct netdev_queue *txq)
 {
        spinlock_t *root_lock = qdisc_lock(q);
-       bool contended = qdisc_is_running(q);
+       bool contended;
        int rc;
 
+       qdisc_skb_cb(skb)->pkt_len = skb->len;
+       qdisc_calculate_pkt_len(skb, q);
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
         * and dequeue packets faster.
         */
+       contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);
 
@@ -2299,7 +2356,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                        skb_dst_force(skb);
 
-               qdisc_skb_cb(skb)->pkt_len = skb->len;
                qdisc_bstats_update(q, skb);
 
                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
@@ -2314,7 +2370,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                rc = NET_XMIT_SUCCESS;
        } else {
                skb_dst_force(skb);
-               rc = qdisc_enqueue_root(skb, q);
+               rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
@@ -2533,6 +2589,53 @@ EXPORT_SYMBOL(__skb_get_rxhash);
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
 
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+           struct rps_dev_flow *rflow, u16 next_cpu)
+{
+       u16 tcpu;
+
+       tcpu = rflow->cpu = next_cpu;
+       if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+               struct netdev_rx_queue *rxqueue;
+               struct rps_dev_flow_table *flow_table;
+               struct rps_dev_flow *old_rflow;
+               u32 flow_id;
+               u16 rxq_index;
+               int rc;
+
+               /* Should we steer this flow to a different hardware queue? */
+               if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
+                       goto out;
+               rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+               if (rxq_index == skb_get_rx_queue(skb))
+                       goto out;
+
+               rxqueue = dev->_rx + rxq_index;
+               flow_table = rcu_dereference(rxqueue->rps_flow_table);
+               if (!flow_table)
+                       goto out;
+               flow_id = skb->rxhash & flow_table->mask;
+               rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+                                                       rxq_index, flow_id);
+               if (rc < 0)
+                       goto out;
+               old_rflow = rflow;
+               rflow = &flow_table->flows[flow_id];
+               rflow->cpu = next_cpu;
+               rflow->filter = rc;
+               if (old_rflow->filter == rflow->filter)
+                       old_rflow->filter = RPS_NO_FILTER;
+       out:
+#endif
+               rflow->last_qtail =
+                       per_cpu(softnet_data, tcpu).input_queue_head;
+       }
+
+       return rflow;
+}
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
@@ -2604,12 +2707,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-                     rflow->last_qtail)) >= 0)) {
-                       tcpu = rflow->cpu = next_cpu;
-                       if (tcpu != RPS_NO_CPU)
-                               rflow->last_qtail = per_cpu(softnet_data,
-                                   tcpu).input_queue_head;
-               }
+                     rflow->last_qtail)) >= 0))
+                       rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
@@ -2630,6 +2730,46 @@ done:
        return cpu;
 }
 
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+                        u32 flow_id, u16 filter_id)
+{
+       struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+       struct rps_dev_flow_table *flow_table;
+       struct rps_dev_flow *rflow;
+       bool expire = true;
+       int cpu;
+
+       rcu_read_lock();
+       flow_table = rcu_dereference(rxqueue->rps_flow_table);
+       if (flow_table && flow_id <= flow_table->mask) {
+               rflow = &flow_table->flows[flow_id];
+               cpu = ACCESS_ONCE(rflow->cpu);
+               if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+                   ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+                          rflow->last_qtail) <
+                    (int)(10 * flow_table->mask)))
+                       expire = false;
+       }
+       rcu_read_unlock();
+       return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
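
For reference, a driver-side sketch (not part of the patch) of the periodic expiry pass that the kernel-doc above describes. The filter table layout and example_remove_hw_filter() are hypothetical driver-private details; only rps_may_expire_flow() comes from this file.

/* Hypothetical per-driver record for one installed RFS filter. */
struct example_rfs_filter {
	bool	in_use;
	u16	rxq_index;	/* queue index passed to ndo_rx_flow_steer() */
	u32	flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
	u16	filter_id;	/* value returned by ndo_rx_flow_steer() */
};

/* Stand-in for the driver's own hardware filter removal routine. */
static void example_remove_hw_filter(struct net_device *dev, u16 filter_id);

/* Run periodically, e.g. from a delayed work item. */
static void example_rfs_expire(struct net_device *dev,
			       struct example_rfs_filter *tbl, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			example_remove_hw_filter(dev, tbl[i].filter_id);
			tbl[i].in_use = false;
		}
	}
}
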
 /* Called from hardirq (IPI) context */
 static void rps_trigger_softirq(void *data)
 {
@@ -3914,12 +4054,15 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 
 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-       struct net_device *dev = (v == SEQ_START_TOKEN) ?
-                                 first_net_device(seq_file_net(seq)) :
-                                 next_net_device((struct net_device *)v);
+       struct net_device *dev = v;
+
+       if (v == SEQ_START_TOKEN)
+               dev = first_net_device_rcu(seq_file_net(seq));
+       else
+               dev = next_net_device_rcu(dev);
 
        ++*pos;
-       return rcu_dereference(dev);
+       return dev;
 }
 
 void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4575,6 +4718,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 }
 EXPORT_SYMBOL(dev_set_mtu);
 
+/**
+ *     dev_set_group - Change group this device belongs to
+ *     @dev: device
+ *     @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+       dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
 /**
  *     dev_set_mac_address - Change Media Access Control Address
  *     @dev: device
@@ -5065,41 +5219,49 @@ static void rollback_registered(struct net_device *dev)
        rollback_registered_many(&single);
 }
 
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+u32 netdev_fix_features(struct net_device *dev, u32 features)
 {
+       /* Fix illegal checksum combinations */
+       if ((features & NETIF_F_HW_CSUM) &&
+           (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+               netdev_info(dev, "mixed HW and IP checksum settings.\n");
+               features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+       }
+
+       if ((features & NETIF_F_NO_CSUM) &&
+           (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+               netdev_info(dev, "mixed no checksumming and other settings.\n");
+               features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+       }
+
        /* Fix illegal SG+CSUM combinations. */
        if ((features & NETIF_F_SG) &&
            !(features & NETIF_F_ALL_CSUM)) {
-               if (name)
-                       printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
-                              "checksum feature.\n", name);
+               netdev_info(dev,
+                           "Dropping NETIF_F_SG since no checksum feature.\n");
                features &= ~NETIF_F_SG;
        }
 
        /* TSO requires that SG is present as well. */
        if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
-               if (name)
-                       printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
-                              "SG feature.\n", name);
+               netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
                features &= ~NETIF_F_TSO;
        }
 
+       /* UFO needs SG and checksumming */
        if (features & NETIF_F_UFO) {
                /* maybe split UFO into V4 and V6? */
                if (!((features & NETIF_F_GEN_CSUM) ||
                    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
                            == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-                       if (name)
-                               printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
-                                      "since no checksum offload features.\n",
-                                      name);
+                       netdev_info(dev,
+                               "Dropping NETIF_F_UFO since no checksum offload features.\n");
                        features &= ~NETIF_F_UFO;
                }
 
                if (!(features & NETIF_F_SG)) {
-                       if (name)
-                               printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
-                                      "since no NETIF_F_SG feature.\n", name);
+                       netdev_info(dev,
+                               "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
                        features &= ~NETIF_F_UFO;
                }
        }
@@ -5242,22 +5404,7 @@ int register_netdevice(struct net_device *dev)
        if (dev->iflink == -1)
                dev->iflink = dev->ifindex;
 
-       /* Fix illegal checksum combinations */
-       if ((dev->features & NETIF_F_HW_CSUM) &&
-           (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-               printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
-                      dev->name);
-               dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
-       }
-
-       if ((dev->features & NETIF_F_NO_CSUM) &&
-           (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-               printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
-                      dev->name);
-               dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
-       }
-
-       dev->features = netdev_fix_features(dev->features, dev->name);
+       dev->features = netdev_fix_features(dev, dev->features);
 
        /* Enable software GSO if SG is supported. */
        if (dev->features & NETIF_F_SG)
@@ -5683,6 +5830,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #endif
 
        strcpy(dev->name, name);
+       dev->group = INIT_NETDEV_GROUP;
        return dev;
 
 free_all:
@@ -5997,8 +6145,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
  *     @one to the master device with current feature set @all.  Will not
  *     enable anything that is off in @mask. Returns the new feature set.
  */
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
-                                       unsigned long mask)
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
 {
        /* If device needs checksumming, downgrade to it. */
        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))