Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/net/core/dev.c b/net/core/dev.c
index 3acff09..683d493 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <net/ip.h>
+#include <net/mpls.h>
 #include <linux/ipv6.h>
 #include <linux/in.h>
 #include <linux/jhash.h>
 #include <linux/vmalloc.h>
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
+#include <linux/hrtimer.h>
 
 #include "net-sysfs.h"
 
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
  */
 void dev_disable_lro(struct net_device *dev)
 {
-       /*
-        * If we're trying to disable lro on a vlan device
-        * use the underlying physical device instead
-        */
-       if (is_vlan_dev(dev))
-               dev = vlan_dev_real_dev(dev);
-
-       /* the same for macvlan devices */
-       if (netif_is_macvlan(dev))
-               dev = macvlan_dev_real_dev(dev);
+       struct net_device *lower_dev;
+       struct list_head *iter;
 
        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);
 
        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");
+
+       netdev_for_each_lower_dev(dev, lower_dev, iter)
+               dev_disable_lro(lower_dev);
 }
 EXPORT_SYMBOL(dev_disable_lro);
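With the recursive walk above, one call now disables LRO on a device and on
everything stacked below it, so the old vlan/macvlan special cases are no
longer needed. A minimal illustration (hypothetical caller, not part of the
patch; callers are expected to hold RTNL, as with the old code):

/* Disabling LRO on an upper device (vlan, macvlan, bond, ...) now also
 * reaches the real devices underneath via netdev_for_each_lower_dev().
 */
static void example_prepare_forwarding(struct net_device *upper)
{
	dev_disable_lro(upper);
}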
 
@@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
        skb_scrub_packet(skb, true);
        skb->protocol = eth_type_trans(skb, dev);
+       skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
        return 0;
 }
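The added skb_postpull_rcsum() keeps skb->csum valid for CHECKSUM_COMPLETE
packets: eth_type_trans() has just pulled ETH_HLEN bytes, so the stored
checksum must stop covering the Ethernet header. The same pattern applies
anywhere data is pulled from the front of such an skb; a minimal sketch
(illustrative helper name, not from the patch):

/* Keep the complete checksum in sync with the bytes that remain after
 * the link-layer header has been pulled.
 */
static void example_strip_l2(struct sk_buff *skb, struct net_device *dev)
{
	skb->protocol = eth_type_trans(skb, dev);	/* pulls ETH_HLEN */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}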
@@ -2525,12 +2523,12 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 /* If MPLS offload request, verify we are testing hardware MPLS features
  * instead of standard features for the netdev.
  */
-#ifdef CONFIG_NET_MPLS_GSO
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
 {
-       if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+       if (eth_p_mpls(type))
                features &= skb->dev->mpls_features;
 
        return features;
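eth_p_mpls() comes from the newly included <net/mpls.h>; it folds the two
open-coded ethertype comparisons removed above into one helper, roughly:

/* Match both MPLS ethertypes (unicast and multicast), as the removed
 * open-coded test did.
 */
static inline bool eth_p_mpls(__be16 eth_type)
{
	return eth_type == htons(ETH_P_MPLS_UC) ||
	       eth_type == htons(ETH_P_MPLS_MC);
}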
@@ -2565,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
 
 netdev_features_t netif_skb_features(struct sk_buff *skb)
 {
-       const struct net_device *dev = skb->dev;
+       struct net_device *dev = skb->dev;
        netdev_features_t features = dev->features;
        u16 gso_segs = skb_shinfo(skb)->gso_segs;
        __be16 protocol = skb->protocol;
@@ -2573,11 +2571,21 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
        if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
                features &= ~NETIF_F_GSO_MASK;
 
-       if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
-               struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
-               protocol = veh->h_vlan_encapsulated_proto;
-       } else if (!vlan_tx_tag_present(skb)) {
-               return harmonize_features(skb, features);
+       /* If encapsulation offload request, verify we are testing
+        * hardware encapsulation features instead of standard
+        * features for the netdev
+        */
+       if (skb->encapsulation)
+               features &= dev->hw_enc_features;
+
+       if (!vlan_tx_tag_present(skb)) {
+               if (unlikely(protocol == htons(ETH_P_8021Q) ||
+                            protocol == htons(ETH_P_8021AD))) {
+                       struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+                       protocol = veh->h_vlan_encapsulated_proto;
+               } else {
+                       goto finalize;
+               }
        }
 
        features = netdev_intersect_features(features,
@@ -2594,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
                                                     NETIF_F_HW_VLAN_CTAG_TX |
                                                     NETIF_F_HW_VLAN_STAG_TX);
 
+finalize:
+       if (dev->netdev_ops->ndo_features_check)
+               features &= dev->netdev_ops->ndo_features_check(skb, dev,
+                                                               features);
+
        return harmonize_features(skb, features);
 }
 EXPORT_SYMBOL(netif_skb_features);
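The new finalize step gives drivers a per-packet veto over offload features
through the ndo_features_check() net_device op. A hypothetical driver-side
implementation might look like the sketch below (the foo_* names and the
encapsulation restriction are illustrative, not part of this patch):

/* Strip offloads the hardware cannot honour for this particular skb,
 * e.g. tunneled traffic on hardware without encapsulation offloads.
 */
static netdev_features_t foo_features_check(struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	if (skb->encapsulation)
		features &= ~(NETIF_F_GSO_MASK |
			      NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
	return features;
}

static const struct net_device_ops foo_netdev_ops = {
	/* ... other ops ... */
	.ndo_features_check	= foo_features_check,
};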
@@ -2647,12 +2660,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
                                          netdev_features_t features)
 {
        if (vlan_tx_tag_present(skb) &&
-           !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-               skb = __vlan_put_tag(skb, skb->vlan_proto,
-                                    vlan_tx_tag_get(skb));
-               if (skb)
-                       skb->vlan_tci = 0;
-       }
+           !vlan_hw_offload_capable(features, skb->vlan_proto))
+               skb = __vlan_hwaccel_push_inside(skb);
        return skb;
 }
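__vlan_hwaccel_push_inside() is a helper in <linux/if_vlan.h> that factors
out the sequence removed above; roughly (shown for reference and renamed
here so it does not clash with the real helper):

/* Push the out-of-band hwaccel tag into the packet payload and, on
 * success, clear skb->vlan_tci so the tag is not inserted twice.
 */
static inline struct sk_buff *example_push_tag_inside(struct sk_buff *skb)
{
	skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
	if (skb)
		skb->vlan_tci = 0;
	return skb;
}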
 
@@ -2668,19 +2677,12 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
        if (unlikely(!skb))
                goto out_null;
 
-       /* If encapsulation offload request, verify we are testing
-        * hardware encapsulation features instead of standard
-        * features for the netdev
-        */
-       if (skb->encapsulation)
-               features &= dev->hw_enc_features;
-
        if (netif_needs_gso(dev, skb, features)) {
                struct sk_buff *segs;
 
                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
-                       segs = NULL;
+                       goto out_kfree_skb;
                } else if (segs) {
                        consume_skb(skb);
                        skb = segs;
@@ -3304,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
-               if (skb_queue_len(&sd->input_pkt_queue)) {
+               if (qlen) {
 enqueue:
                        __skb_queue_tail(&sd->input_pkt_queue, skb);
                        input_queue_tail_incr_save(sd, qtail);
@@ -4179,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
        struct sk_buff *skb = napi->skb;
 
        if (!skb) {
-               skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+               skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
                napi->skb = skb;
        }
        return skb;
@@ -4316,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
                local_irq_enable();
 }
 
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+       return sd->rps_ipi_list != NULL;
+#else
+       return false;
+#endif
+}
+
 static int process_backlog(struct napi_struct *napi, int quota)
 {
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
-#ifdef CONFIG_RPS
        /* Check if we have pending ipi, its better to send them now,
         * not waiting net_rx_action() end.
         */
-       if (sd->rps_ipi_list) {
+       if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
-#endif
+
        napi->weight = weight_p;
        local_irq_disable();
        while (1) {
@@ -4356,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
                         * We can use a plain write instead of clear_bit(),
                         * and we dont need an smp_mb() memory barrier.
                         */
-                       list_del(&napi->poll_list);
                        napi->state = 0;
                        rps_unlock(sd);
 
@@ -4376,7 +4385,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
  * __napi_schedule - schedule for receive
  * @n: entry to schedule
  *
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
  */
 void __napi_schedule(struct napi_struct *n)
 {
@@ -4388,18 +4398,29 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+       ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
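A typical caller of the new _irqoff variant is a driver's hard-irq handler,
where interrupts are already masked and the local_irq_save()/restore() pair
in __napi_schedule() would be redundant. A minimal sketch (hypothetical
driver; struct foo_ring and the handler name are illustrative, not from
this patch):

/* Hard irqs are masked in an MSI-X handler, so NAPI can be scheduled
 * without touching the interrupt flags.
 */
struct foo_ring {
	struct napi_struct napi;
	/* device-specific fields ... */
};

static irqreturn_t foo_msix_ring(int irq, void *data)
{
	struct foo_ring *ring = data;

	if (napi_schedule_prep(&ring->napi))
		__napi_schedule_irqoff(&ring->napi);
	return IRQ_HANDLED;
}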
+
 void __napi_complete(struct napi_struct *n)
 {
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-       BUG_ON(n->gro_list);
 
-       list_del(&n->poll_list);
+       list_del_init(&n->poll_list);
        smp_mb__before_atomic();
        clear_bit(NAPI_STATE_SCHED, &n->state);
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
 {
        unsigned long flags;
 
@@ -4410,12 +4431,28 @@ void napi_complete(struct napi_struct *n)
        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
                return;
 
-       napi_gro_flush(n, false);
-       local_irq_save(flags);
-       __napi_complete(n);
-       local_irq_restore(flags);
+       if (n->gro_list) {
+               unsigned long timeout = 0;
+
+               if (work_done)
+                       timeout = n->dev->gro_flush_timeout;
+
+               if (timeout)
+                       hrtimer_start(&n->timer, ns_to_ktime(timeout),
+                                     HRTIMER_MODE_REL_PINNED);
+               else
+                       napi_gro_flush(n, false);
+       }
+       if (likely(list_empty(&n->poll_list))) {
+               WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+       } else {
+               /* If n->poll_list is not empty, we need to mask irqs */
+               local_irq_save(flags);
+               __napi_complete(n);
+               local_irq_restore(flags);
+       }
 }
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
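Drivers opt in to the deferred GRO flush by reporting how much work their
poll routine did; when work was done and the device's gro_flush_timeout is
non-zero, the hrtimer armed above postpones the flush instead of doing it
immediately. A sketch of a poll routine using the new call (hypothetical
driver; struct foo_ring, foo_clean_rx_ring() and foo_enable_ring_irq() are
assumed device-specific names):

struct foo_ring {
	struct napi_struct napi;
};

int foo_clean_rx_ring(struct foo_ring *ring, int budget);	/* assumed */
void foo_enable_ring_irq(struct foo_ring *ring);		/* assumed */

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	int work_done = foo_clean_rx_ring(ring, budget);

	if (work_done < budget) {
		/* Budget not exhausted: complete and re-enable the IRQ.
		 * Passing work_done lets the core defer the GRO flush.
		 */
		napi_complete_done(napi, work_done);
		foo_enable_ring_irq(ring);
	}
	return work_done;
}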
 
 /* must be called under rcu_read_lock(), as we dont take a reference */
 struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4469,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL_GPL(napi_hash_del);
 
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+       struct napi_struct *napi;
+
+       napi = container_of(timer, struct napi_struct, timer);
+       if (napi->gro_list)
+               napi_schedule(napi);
+
+       return HRTIMER_NORESTART;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
 {
        INIT_LIST_HEAD(&napi->poll_list);
+       hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       napi->timer.function = napi_watchdog;
        napi->gro_count = 0;
        napi->gro_list = NULL;
        napi->skb = NULL;
@@ -4491,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 }
 EXPORT_SYMBOL(netif_napi_add);
 
+void napi_disable(struct napi_struct *n)
+{
+       might_sleep();
+       set_bit(NAPI_STATE_DISABLE, &n->state);
+
+       while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+               msleep(1);
+
+       hrtimer_cancel(&n->timer);
+
+       clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
 void netif_napi_del(struct napi_struct *napi)
 {
        list_del_init(&napi->dev_list);
@@ -4502,91 +4566,112 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);
 
-static void net_rx_action(struct softirq_action *h)
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
-       struct softnet_data *sd = this_cpu_ptr(&softnet_data);
-       unsigned long time_limit = jiffies + 2;
-       int budget = netdev_budget;
        void *have;
+       int work, weight;
 
-       local_irq_disable();
+       list_del_init(&n->poll_list);
 
-       while (!list_empty(&sd->poll_list)) {
-               struct napi_struct *n;
-               int work, weight;
+       have = netpoll_poll_lock(n);
 
-               /* If softirq window is exhuasted then punt.
-                * Allow this to run for 2 jiffies since which will allow
-                * an average latency of 1.5/HZ.
-                */
-               if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
-                       goto softnet_break;
+       weight = n->weight;
 
-               local_irq_enable();
+       /* This NAPI_STATE_SCHED test is for avoiding a race
+        * with netpoll's poll_napi().  Only the entity which
+        * obtains the lock and sees NAPI_STATE_SCHED set will
+        * actually make the ->poll() call.  Therefore we avoid
+        * accidentally calling ->poll() when NAPI is not scheduled.
+        */
+       work = 0;
+       if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+               work = n->poll(n, weight);
+               trace_napi_poll(n);
+       }
 
-               /* Even though interrupts have been re-enabled, this
-                * access is safe because interrupts can only add new
-                * entries to the tail of this list, and only ->poll()
-                * calls can remove this head entry from the list.
-                */
-               n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+       WARN_ON_ONCE(work > weight);
 
-               have = netpoll_poll_lock(n);
+       if (likely(work < weight))
+               goto out_unlock;
 
-               weight = n->weight;
+       /* Drivers must not modify the NAPI state if they
+        * consume the entire weight.  In such cases this code
+        * still "owns" the NAPI instance and therefore can
+        * move the instance around on the list at-will.
+        */
+       if (unlikely(napi_disable_pending(n))) {
+               napi_complete(n);
+               goto out_unlock;
+       }
 
-               /* This NAPI_STATE_SCHED test is for avoiding a race
-                * with netpoll's poll_napi().  Only the entity which
-                * obtains the lock and sees NAPI_STATE_SCHED set will
-                * actually make the ->poll() call.  Therefore we avoid
-                * accidentally calling ->poll() when NAPI is not scheduled.
+       if (n->gro_list) {
+               /* flush too old packets
+                * If HZ < 1000, flush all packets.
                 */
-               work = 0;
-               if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-                       work = n->poll(n, weight);
-                       trace_napi_poll(n);
-               }
+               napi_gro_flush(n, HZ >= 1000);
+       }
 
-               WARN_ON_ONCE(work > weight);
+       /* Some drivers may have called napi_schedule
+        * prior to exhausting their budget.
+        */
+       if (unlikely(!list_empty(&n->poll_list))) {
+               pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+                            n->dev ? n->dev->name : "backlog");
+               goto out_unlock;
+       }
 
-               budget -= work;
+       list_add_tail(&n->poll_list, repoll);
 
-               local_irq_disable();
+out_unlock:
+       netpoll_poll_unlock(have);
 
-               /* Drivers must not modify the NAPI state if they
-                * consume the entire weight.  In such cases this code
-                * still "owns" the NAPI instance and therefore can
-                * move the instance around on the list at-will.
-                */
-               if (unlikely(work == weight)) {
-                       if (unlikely(napi_disable_pending(n))) {
-                               local_irq_enable();
-                               napi_complete(n);
-                               local_irq_disable();
-                       } else {
-                               if (n->gro_list) {
-                                       /* flush too old packets
-                                        * If HZ < 1000, flush all packets.
-                                        */
-                                       local_irq_enable();
-                                       napi_gro_flush(n, HZ >= 1000);
-                                       local_irq_disable();
-                               }
-                               list_move_tail(&n->poll_list, &sd->poll_list);
-                       }
+       return work;
+}
+
+static void net_rx_action(struct softirq_action *h)
+{
+       struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+       unsigned long time_limit = jiffies + 2;
+       int budget = netdev_budget;
+       LIST_HEAD(list);
+       LIST_HEAD(repoll);
+
+       local_irq_disable();
+       list_splice_init(&sd->poll_list, &list);
+       local_irq_enable();
+
+       for (;;) {
+               struct napi_struct *n;
+
+               if (list_empty(&list)) {
+                       if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+                               return;
+                       break;
                }
 
-               netpoll_poll_unlock(have);
+               n = list_first_entry(&list, struct napi_struct, poll_list);
+               budget -= napi_poll(n, &repoll);
+
+               /* If softirq window is exhausted then punt.
+                * Allow this to run for 2 jiffies, which allows
+                * an average latency of 1.5/HZ.
+                */
+               if (unlikely(budget <= 0 ||
+                            time_after_eq(jiffies, time_limit))) {
+                       sd->time_squeeze++;
+                       break;
+               }
        }
-out:
-       net_rps_action_and_irq_enable(sd);
 
-       return;
+       local_irq_disable();
 
-softnet_break:
-       sd->time_squeeze++;
-       __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-       goto out;
+       list_splice_tail_init(&sd->poll_list, &list);
+       list_splice_tail(&repoll, &list);
+       list_splice(&list, &sd->poll_list);
+       if (!list_empty(&sd->poll_list))
+               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
+       net_rps_action_and_irq_enable(sd);
 }
 
 struct netdev_adjacent {
@@ -5786,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier);
  *     Get device physical port ID
  */
 int dev_get_phys_port_id(struct net_device *dev,
-                        struct netdev_phys_port_id *ppid)
+                        struct netdev_phys_item_id *ppid)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -5865,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
        synchronize_net();
 
        list_for_each_entry(dev, head, unreg_list) {
+               struct sk_buff *skb = NULL;
+
                /* Shutdown queueing discipline. */
                dev_shutdown(dev);
 
@@ -5874,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head)
                */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
+               if (!dev->rtnl_link_ops ||
+                   dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+                       skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+                                                    GFP_KERNEL);
+
                /*
                 *      Flush the unicast and multicast chains
                 */
@@ -5883,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head)
                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);
 
-               if (!dev->rtnl_link_ops ||
-                   dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
-                       rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+               if (skb)
+                       rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 
                /* Notifier chain MUST detach us all upper devices. */
                WARN_ON(netdev_has_any_upper_dev(dev));