Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/net/core/dev.c b/net/core/dev.c
index 3acff09..683d493 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <net/ip.h>
+#include <net/mpls.h>
 #include <linux/ipv6.h>
 #include <linux/in.h>
 #include <linux/jhash.h>
 #include <linux/vmalloc.h>
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
+#include <linux/hrtimer.h>
 
 #include "net-sysfs.h"
 
@@ -1435,22 +1437,17 @@ EXPORT_SYMBOL(dev_close);
  */
 void dev_disable_lro(struct net_device *dev)
 {
-       /*
-        * If we're trying to disable lro on a vlan device
-        * use the underlying physical device instead
-        */
-       if (is_vlan_dev(dev))
-               dev = vlan_dev_real_dev(dev);
-
-       /* the same for macvlan devices */
-       if (netif_is_macvlan(dev))
-               dev = macvlan_dev_real_dev(dev);
+       struct net_device *lower_dev;
+       struct list_head *iter;
 
        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);
 
        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");
+
+       netdev_for_each_lower_dev(dev, lower_dev, iter)
+               dev_disable_lro(lower_dev);
 }
 EXPORT_SYMBOL(dev_disable_lro);
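With the recursive walk above, one call now disables LRO on a device and on
everything stacked below it, so the old vlan/macvlan special cases are no
longer needed. A minimal illustration (hypothetical caller, not part of the
patch; callers are expected to hold RTNL, as with the old code):

/* Disabling LRO on an upper device (vlan, macvlan, bond, ...) now also
 * reaches the real devices underneath via netdev_for_each_lower_dev().
 */
static void example_prepare_forwarding(struct net_device *upper)
{
	dev_disable_lro(upper);
}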
 
@@ -1697,6 +1694,7 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 
        skb_scrub_packet(skb, true);
        skb->protocol = eth_type_trans(skb, dev);
+       skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
        return 0;
 }
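The added skb_postpull_rcsum() keeps skb->csum valid for CHECKSUM_COMPLETE
packets: eth_type_trans() has just pulled ETH_HLEN bytes, so the stored
checksum must stop covering the Ethernet header. The same pattern applies
anywhere data is pulled from the front of such an skb; a minimal sketch
(illustrative helper name, not from the patch):

/* Keep the complete checksum in sync with the bytes that remain after
 * the link-layer header has been pulled.
 */
static void example_strip_l2(struct sk_buff *skb, struct net_device *dev)
{
	skb->protocol = eth_type_trans(skb, dev);	/* pulls ETH_HLEN */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}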
@@ -2525,12 +2523,12 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 /* If MPLS offload request, verify we are testing hardware MPLS features
  * instead of standard features for the netdev.
  */
-#ifdef CONFIG_NET_MPLS_GSO
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
 {
-       if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+       if (eth_p_mpls(type))
                features &= skb->dev->mpls_features;
 
        return features;
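eth_p_mpls() comes from the newly included <net/mpls.h>; it folds the two
open-coded ethertype comparisons removed above into one helper, roughly:

/* Match both MPLS ethertypes (unicast and multicast), as the removed
 * open-coded test did.
 */
static inline bool eth_p_mpls(__be16 eth_type)
{
	return eth_type == htons(ETH_P_MPLS_UC) ||
	       eth_type == htons(ETH_P_MPLS_MC);
}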
@@ -2565,7 +2563,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
 
 netdev_features_t netif_skb_features(struct sk_buff *skb)
 {
-       const struct net_device *dev = skb->dev;
+       struct net_device *dev = skb->dev;
        netdev_features_t features = dev->features;
        u16 gso_segs = skb_shinfo(skb)->gso_segs;
        __be16 protocol = skb->protocol;
@@ -2573,11 +2571,21 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
        if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
                features &= ~NETIF_F_GSO_MASK;
 
-       if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
-               struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
-               protocol = veh->h_vlan_encapsulated_proto;
-       } else if (!vlan_tx_tag_present(skb)) {
-               return harmonize_features(skb, features);
+       /* If encapsulation offload request, verify we are testing
+        * hardware encapsulation features instead of standard
+        * features for the netdev
+        */
+       if (skb->encapsulation)
+               features &= dev->hw_enc_features;
+
+       if (!vlan_tx_tag_present(skb)) {
+               if (unlikely(protocol == htons(ETH_P_8021Q) ||
+                            protocol == htons(ETH_P_8021AD))) {
+                       struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+                       protocol = veh->h_vlan_encapsulated_proto;
+               } else {
+                       goto finalize;
+               }
        }
 
        features = netdev_intersect_features(features,
@@ -2594,6 +2602,11 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
                                                     NETIF_F_HW_VLAN_CTAG_TX |
                                                     NETIF_F_HW_VLAN_STAG_TX);
 
+finalize:
+       if (dev->netdev_ops->ndo_features_check)
+               features &= dev->netdev_ops->ndo_features_check(skb, dev,
+                                                               features);
+
        return harmonize_features(skb, features);
 }
 EXPORT_SYMBOL(netif_skb_features);
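The new finalize step gives drivers a per-packet veto over offload features
through the ndo_features_check() net_device op. A hypothetical driver-side
implementation might look like the sketch below (the foo_* names and the
encapsulation restriction are illustrative, not part of this patch):

/* Strip offloads the hardware cannot honour for this particular skb,
 * e.g. tunneled traffic on hardware without encapsulation offloads.
 */
static netdev_features_t foo_features_check(struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	if (skb->encapsulation)
		features &= ~(NETIF_F_GSO_MASK |
			      NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
	return features;
}

static const struct net_device_ops foo_netdev_ops = {
	/* ... other ops ... */
	.ndo_features_check	= foo_features_check,
};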
@@ -2647,12 +2660,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
                                          netdev_features_t features)
 {
        if (vlan_tx_tag_present(skb) &&
-           !vlan_hw_offload_capable(features, skb->vlan_proto)) {
-               skb = __vlan_put_tag(skb, skb->vlan_proto,
-                                    vlan_tx_tag_get(skb));
-               if (skb)
-                       skb->vlan_tci = 0;
-       }
+           !vlan_hw_offload_capable(features, skb->vlan_proto))
+               skb = __vlan_hwaccel_push_inside(skb);
        return skb;
 }
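__vlan_hwaccel_push_inside() is a helper in <linux/if_vlan.h> that factors
out the sequence removed above; roughly (shown for reference and renamed
here so it does not clash with the real helper):

/* Push the out-of-band hwaccel tag into the packet payload and, on
 * success, clear skb->vlan_tci so the tag is not inserted twice.
 */
static inline struct sk_buff *example_push_tag_inside(struct sk_buff *skb)
{
	skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
	if (skb)
		skb->vlan_tci = 0;
	return skb;
}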
 
@@ -2668,19 +2677,12 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
        if (unlikely(!skb))
                goto out_null;
 
-       /* If encapsulation offload request, verify we are testing
-        * hardware encapsulation features instead of standard
-        * features for the netdev
-        */
-       if (skb->encapsulation)
-               features &= dev->hw_enc_features;
-
        if (netif_needs_gso(dev, skb, features)) {
                struct sk_buff *segs;
 
                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
-                       segs = NULL;
+                       goto out_kfree_skb;
                } else if (segs) {
                        consume_skb(skb);
                        skb = segs;
@@ -3304,7 +3306,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
-               if (skb_queue_len(&sd->input_pkt_queue)) {
+               if (qlen) {
 enqueue:
                        __skb_queue_tail(&sd->input_pkt_queue, skb);
                        input_queue_tail_incr_save(sd, qtail);
@@ -4179,7 +4181,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
        struct sk_buff *skb = napi->skb;
 
        if (!skb) {
-               skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+               skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
                napi->skb = skb;
        }
        return skb;
@@ -4316,20 +4318,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
                local_irq_enable();
 }
 
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+       return sd->rps_ipi_list != NULL;
+#else
+       return false;
+#endif
+}
+
 static int process_backlog(struct napi_struct *napi, int quota)
 {
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 
-#ifdef CONFIG_RPS
        /* Check if we have pending ipi, its better to send them now,
         * not waiting net_rx_action() end.
         */
-       if (sd->rps_ipi_list) {
+       if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
-#endif
+
        napi->weight = weight_p;
        local_irq_disable();
        while (1) {
@@ -4356,7 +4366,6 @@ static int process_backlog(struct napi_struct *napi, int quota)
                         * We can use a plain write instead of clear_bit(),
                         * and we dont need an smp_mb() memory barrier.
                         */
-                       list_del(&napi->poll_list);
                        napi->state = 0;
                        rps_unlock(sd);
 
@@ -4376,7 +4385,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
  * __napi_schedule - schedule for receive
  * @n: entry to schedule
  *
- * The entry's receive function will be scheduled to run
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
  */
 void __napi_schedule(struct napi_struct *n)
 {
@@ -4388,18 +4398,29 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+       ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
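A typical caller of the new _irqoff variant is a driver's hard-irq handler,
where interrupts are already masked and the local_irq_save()/restore() pair
in __napi_schedule() would be redundant. A minimal sketch (hypothetical
driver; struct foo_ring and the handler name are illustrative, not from
this patch):

/* Hard irqs are masked in an MSI-X handler, so NAPI can be scheduled
 * without touching the interrupt flags.
 */
struct foo_ring {
	struct napi_struct napi;
	/* device-specific fields ... */
};

static irqreturn_t foo_msix_ring(int irq, void *data)
{
	struct foo_ring *ring = data;

	if (napi_schedule_prep(&ring->napi))
		__napi_schedule_irqoff(&ring->napi);
	return IRQ_HANDLED;
}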
+
 void __napi_complete(struct napi_struct *n)
 {
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-       BUG_ON(n->gro_list);
 
-       list_del(&n->poll_list);
+       list_del_init(&n->poll_list);
        smp_mb__before_atomic();
        clear_bit(NAPI_STATE_SCHED, &n->state);
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete(struct napi_struct *n)
+void napi_complete_done(struct napi_struct *n, int work_done)
 {
        unsigned long flags;
 
@@ -4410,12 +4431,28 @@ void napi_complete(struct napi_struct *n)
        if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
                return;
 
-       napi_gro_flush(n, false);
-       local_irq_save(flags);
-       __napi_complete(n);
-       local_irq_restore(flags);
+       if (n->gro_list) {
+               unsigned long timeout = 0;
+
+               if (work_done)
+                       timeout = n->dev->gro_flush_timeout;
+
+               if (timeout)
+                       hrtimer_start(&n->timer, ns_to_ktime(timeout),
+                                     HRTIMER_MODE_REL_PINNED);
+               else
+                       napi_gro_flush(n, false);
+       }
+       if (likely(list_empty(&n->poll_list))) {
+               WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+       } else {
+               /* If n->poll_list is not empty, we need to mask irqs */
+               local_irq_save(flags);
+               __napi_complete(n);
+               local_irq_restore(flags);
+       }
 }
-EXPORT_SYMBOL(napi_complete);
+EXPORT_SYMBOL(napi_complete_done);
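Drivers opt in to the deferred GRO flush by reporting how much work their
poll routine did; when work was done and the device's gro_flush_timeout is
non-zero, the hrtimer armed above postpones the flush instead of doing it
immediately. A sketch of a poll routine using the new call (hypothetical
driver; struct foo_ring, foo_clean_rx_ring() and foo_enable_ring_irq() are
assumed device-specific names):

struct foo_ring {
	struct napi_struct napi;
};

int foo_clean_rx_ring(struct foo_ring *ring, int budget);	/* assumed */
void foo_enable_ring_irq(struct foo_ring *ring);		/* assumed */

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	int work_done = foo_clean_rx_ring(ring, budget);

	if (work_done < budget) {
		/* Budget not exhausted: complete and re-enable the IRQ.
		 * Passing work_done lets the core defer the GRO flush.
		 */
		napi_complete_done(napi, work_done);
		foo_enable_ring_irq(ring);
	}
	return work_done;
}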
 
 /* must be called under rcu_read_lock(), as we dont take a reference */
 struct napi_struct *napi_by_id(unsigned int napi_id)
@@ -4469,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL_GPL(napi_hash_del);
 
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+       struct napi_struct *napi;
+
+       napi = container_of(timer, struct napi_struct, timer);
+       if (napi->gro_list)
+               napi_schedule(napi);
+
+       return HRTIMER_NORESTART;
+}
+
 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
 {
        INIT_LIST_HEAD(&napi->poll_list);
+       hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       napi->timer.function = napi_watchdog;
        napi->gro_count = 0;
        napi->gro_list = NULL;
        napi->skb = NULL;
@@ -4491,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 }
 EXPORT_SYMBOL(netif_napi_add);
 
+void napi_disable(struct napi_struct *n)
+{
+       might_sleep();
+       set_bit(NAPI_STATE_DISABLE, &n->state);
+
+       while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+               msleep(1);
+
+       hrtimer_cancel(&n->timer);
+
+       clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
 void netif_napi_del(struct napi_struct *napi)
 {
        list_del_init(&napi->dev_list);
@@ -4502,91 +4566,112 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);
 
-static void net_rx_action(struct softirq_action *h)
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 {
-       struct softnet_data *sd = this_cpu_ptr(&softnet_data);
-       unsigned long time_limit = jiffies + 2;
-       int budget = netdev_budget;
        void *have;
+       int work, weight;
 
-       local_irq_disable();
+       list_del_init(&n->poll_list);
 
-       while (!list_empty(&sd->poll_list)) {
-               struct napi_struct *n;
-               int work, weight;
+       have = netpoll_poll_lock(n);
 
-               /* If softirq window is exhuasted then punt.
-                * Allow this to run for 2 jiffies since which will allow
-                * an average latency of 1.5/HZ.
-                */
-               if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
-                       goto softnet_break;
+       weight = n->weight;
 
-               local_irq_enable();
+       /* This NAPI_STATE_SCHED test is for avoiding a race
+        * with netpoll's poll_napi().  Only the entity which
+        * obtains the lock and sees NAPI_STATE_SCHED set will
+        * actually make the ->poll() call.  Therefore we avoid
+        * accidentally calling ->poll() when NAPI is not scheduled.
+        */
+       work = 0;
+       if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+               work = n->poll(n, weight);
+               trace_napi_poll(n);
+       }
 
-               /* Even though interrupts have been re-enabled, this
-                * access is safe because interrupts can only add new
-                * entries to the tail of this list, and only ->poll()
-                * calls can remove this head entry from the list.
-                */
-               n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+       WARN_ON_ONCE(work > weight);
 
-               have = netpoll_poll_lock(n);
+       if (likely(work < weight))
+               goto out_unlock;
 
-               weight = n->weight;
+       /* Drivers must not modify the NAPI state if they
+        * consume the entire weight.  In such cases this code
+        * still "owns" the NAPI instance and therefore can
+        * move the instance around on the list at-will.
+        */
+       if (unlikely(napi_disable_pending(n))) {
+               napi_complete(n);
+               goto out_unlock;
+       }
 
-               /* This NAPI_STATE_SCHED test is for avoiding a race
-                * with netpoll's poll_napi().  Only the entity which
-                * obtains the lock and sees NAPI_STATE_SCHED set will
-                * actually make the ->poll() call.  Therefore we avoid
-                * accidentally calling ->poll() when NAPI is not scheduled.
+       if (n->gro_list) {
+               /* flush too old packets
+                * If HZ < 1000, flush all packets.
                 */
-               work = 0;
-               if (test_bit(NAPI_STATE_SCHED, &n->state)) {
-                       work = n->poll(n, weight);
-                       trace_napi_poll(n);
-               }
+               napi_gro_flush(n, HZ >= 1000);
+       }
 
-               WARN_ON_ONCE(work > weight);
+       /* Some drivers may have called napi_schedule
+        * prior to exhausting their budget.
+        */
+       if (unlikely(!list_empty(&n->poll_list))) {
+               pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+                            n->dev ? n->dev->name : "backlog");
+               goto out_unlock;
+       }
 
-               budget -= work;
+       list_add_tail(&n->poll_list, repoll);
 
-               local_irq_disable();
+out_unlock:
+       netpoll_poll_unlock(have);
 
-               /* Drivers must not modify the NAPI state if they
-                * consume the entire weight.  In such cases this code
-                * still "owns" the NAPI instance and therefore can
-                * move the instance around on the list at-will.
-                */
-               if (unlikely(work == weight)) {
-                       if (unlikely(napi_disable_pending(n))) {
-                               local_irq_enable();
-                               napi_complete(n);
-                               local_irq_disable();
-                       } else {
-                               if (n->gro_list) {
-                                       /* flush too old packets
-                                        * If HZ < 1000, flush all packets.
-                                        */
-                                       local_irq_enable();
-                                       napi_gro_flush(n, HZ >= 1000);
-                                       local_irq_disable();
-                               }
-                               list_move_tail(&n->poll_list, &sd->poll_list);
-                       }
+       return work;
+}
+
+static void net_rx_action(struct softirq_action *h)
+{
+       struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+       unsigned long time_limit = jiffies + 2;
+       int budget = netdev_budget;
+       LIST_HEAD(list);
+       LIST_HEAD(repoll);
+
+       local_irq_disable();
+       list_splice_init(&sd->poll_list, &list);
+       local_irq_enable();
+
+       for (;;) {
+               struct napi_struct *n;
+
+               if (list_empty(&list)) {
+                       if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+                               return;
+                       break;
                }
 
-               netpoll_poll_unlock(have);
+               n = list_first_entry(&list, struct napi_struct, poll_list);
+               budget -= napi_poll(n, &repoll);
+
+               /* If softirq window is exhausted then punt.
+                * Allow this to run for 2 jiffies, which allows
+                * an average latency of 1.5/HZ.
+                */
+               if (unlikely(budget <= 0 ||
+                            time_after_eq(jiffies, time_limit))) {
+                       sd->time_squeeze++;
+                       break;
+               }
        }
-out:
-       net_rps_action_and_irq_enable(sd);
 
-       return;
+       local_irq_disable();
 
-softnet_break:
-       sd->time_squeeze++;
-       __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-       goto out;
+       list_splice_tail_init(&sd->poll_list, &list);
+       list_splice_tail(&repoll, &list);
+       list_splice(&list, &sd->poll_list);
+       if (!list_empty(&sd->poll_list))
+               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
+       net_rps_action_and_irq_enable(sd);
 }
 
 struct netdev_adjacent {
@@ -5786,7 +5871,7 @@ EXPORT_SYMBOL(dev_change_carrier);
  *     Get device physical port ID
  */
 int dev_get_phys_port_id(struct net_device *dev,
-                        struct netdev_phys_port_id *ppid)
+                        struct netdev_phys_item_id *ppid)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -5865,6 +5950,8 @@ static void rollback_registered_many(struct list_head *head)
        synchronize_net();
 
        list_for_each_entry(dev, head, unreg_list) {
+               struct sk_buff *skb = NULL;
+
                /* Shutdown queueing discipline. */
                dev_shutdown(dev);
 
@@ -5874,6 +5961,11 @@ static void rollback_registered_many(struct list_head *head)
                */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
+               if (!dev->rtnl_link_ops ||
+                   dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+                       skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+                                                    GFP_KERNEL);
+
                /*
                 *      Flush the unicast and multicast chains
                 */
@@ -5883,9 +5975,8 @@ static void rollback_registered_many(struct list_head *head)
                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);
 
-               if (!dev->rtnl_link_ops ||
-                   dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
-                       rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+               if (skb)
+                       rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 
                /* Notifier chain MUST detach us all upper devices. */
                WARN_ON(netdev_has_any_upper_dev(dev));