net: better IFF_XMIT_DST_RELEASE support
author Eric Dumazet <edumazet@google.com>
Mon, 6 Oct 2014 01:38:35 +0000 (18:38 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 7 Oct 2014 17:22:11 +0000 (13:22 -0400)
Testing xmit_more support with netperf and connected UDP sockets,
I found strange dst refcount false sharing.

Current handling of IFF_XMIT_DST_RELEASE is not optimal.

Dropping the dst in validate_xmit_skb() is certainly too late when a
packet was queued by cpu X but dequeued by cpu Y.

The logical point to take care of drop/force is in __dev_queue_xmit()
before even taking qdisc lock.

As Julian Anastasov pointed out, the need for skb_dst() might also come
from some packet schedulers or classifiers.

This patch adds a new helper to cleanly express the needs of various
drivers or qdiscs/classifiers.

Drivers that need skb_dst() in their ndo_start_xmit() should call the
following helper in their setup instead of the prior idiom:

dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
->
netif_keep_dst(dev);

Instead of using a single bit, we use two bits. The first one
(IFF_XMIT_DST_RELEASE) may be rebuilt in the bonding/team drivers.

The other one (IFF_XMIT_DST_RELEASE_PERM) is permanent and blocks
IFF_XMIT_DST_RELEASE from being rebuilt in bonding/team. We could add
something smarter later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
27 files changed:
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/net/appletalk/ipddp.c
drivers/net/bonding/bond_main.c
drivers/net/eql.c
drivers/net/ifb.c
drivers/net/loopback.c
drivers/net/macvlan.c
drivers/net/ppp/ppp_generic.c
drivers/net/team/team.c
drivers/net/vxlan.c
drivers/net/wan/hdlc_fr.c
drivers/s390/net/qeth_l3_main.c
include/linux/netdevice.h
net/8021q/vlan_dev.c
net/atm/clip.c
net/core/dev.c
net/ipv4/ip_gre.c
net/ipv4/ip_vti.c
net/ipv4/ipip.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_tunnel.c
net/ipv6/ip6_vti.c
net/ipv6/sit.c
net/sched/cls_flow.c
net/sched/cls_route.c
net/sched/sch_generic.c
net/sched/sch_teql.c

index 13e6e04..58b5aa3 100644 (file)
@@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev)
        dev->tx_queue_len        = ipoib_sendq_size * 2;
        dev->features            = (NETIF_F_VLAN_CHALLENGED     |
                                    NETIF_F_HIGHDMA);
-       dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 
        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
index 10d0dba..e90c6a7 100644 (file)
@@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void)
        if (!dev)
                return ERR_PTR(-ENOMEM);
 
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        strcpy(dev->name, "ipddp%d");
 
        if (version_printed++ == 0)
index 3ad5413..c9ac06c 100644 (file)
@@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev,
 
 static void bond_compute_features(struct bonding *bond)
 {
-       unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
+       unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+                                       IFF_XMIT_DST_RELEASE_PERM;
        netdev_features_t vlan_features = BOND_VLAN_FEATURES;
        netdev_features_t enc_features  = BOND_ENC_FEATURES;
        struct net_device *bond_dev = bond->dev;
@@ -1038,8 +1039,10 @@ done:
        bond_dev->gso_max_segs = gso_max_segs;
        netif_set_gso_max_size(bond_dev, gso_max_size);
 
-       flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
-       bond_dev->priv_flags = flags | dst_release_flag;
+       bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+           dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+               bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
 
        netdev_change_features(bond_dev);
 }
index 957e5c0..a10ad74 100644 (file)
@@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev)
 
        dev->type               = ARPHRD_SLIP;
        dev->tx_queue_len       = 5;            /* Hands them off fast */
-       dev->priv_flags        &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 static int eql_open(struct net_device *dev)
index d2d4a3d..34f846b 100644 (file)
@@ -185,7 +185,8 @@ static void ifb_setup(struct net_device *dev)
 
        dev->flags |= IFF_NOARP;
        dev->flags &= ~IFF_MULTICAST;
-       dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+       dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+       netif_keep_dst(dev);
        eth_hw_addr_random(dev);
 }
 
index 8f22625..c76283c 100644 (file)
@@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev)
        dev->type               = ARPHRD_LOOPBACK;      /* 0x0001*/
        dev->flags              = IFF_LOOPBACK;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
-       dev->priv_flags        &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        dev->hw_features        = NETIF_F_ALL_TSO | NETIF_F_UFO;
        dev->features           = NETIF_F_SG | NETIF_F_FRAGLIST
                | NETIF_F_ALL_TSO
index e8a453f..38b4fae 100644 (file)
@@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev)
 {
        ether_setup(dev);
 
-       dev->priv_flags        &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+       dev->priv_flags        &= ~IFF_TX_SKB_SHARING;
+       netif_keep_dst(dev);
        dev->priv_flags        |= IFF_UNICAST_FLT;
        dev->netdev_ops         = &macvlan_netdev_ops;
        dev->destructor         = free_netdev;
index fa0d717..80e6f34 100644 (file)
@@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev)
        dev->type = ARPHRD_PPP;
        dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
        dev->features |= NETIF_F_NETNS_LOCAL;
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 /*
index 2277c36..a94a9df 100644 (file)
@@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team)
        struct team_port *port;
        u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
        unsigned short max_hard_header_len = ETH_HLEN;
-       unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
+       unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+                                       IFF_XMIT_DST_RELEASE_PERM;
 
        list_for_each_entry(port, &team->port_list, list) {
                vlan_features = netdev_increment_features(vlan_features,
@@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team)
        team->dev->vlan_features = vlan_features;
        team->dev->hard_header_len = max_hard_header_len;
 
-       flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
-       team->dev->priv_flags = flags | dst_release_flag;
+       team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+               team->dev->priv_flags |= IFF_XMIT_DST_RELEASE;
 
        netdev_change_features(team->dev);
 }
index 2af795d..2a51e6e 100644 (file)
@@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev)
        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
        dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
 
        INIT_LIST_HEAD(&vxlan->next);
index e5c7e61..3ebed1c 100644 (file)
@@ -1047,7 +1047,7 @@ static void pvc_setup(struct net_device *dev)
        dev->flags = IFF_POINTOPOINT;
        dev->hard_header_len = 10;
        dev->addr_len = 2;
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 static const struct net_device_ops pvc_ops = {
index f8427a2..afebb97 100644 (file)
@@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
        card->dev->features |=  NETIF_F_HW_VLAN_CTAG_TX |
                                NETIF_F_HW_VLAN_CTAG_RX |
                                NETIF_F_HW_VLAN_CTAG_FILTER;
-       card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(card->dev);
        card->dev->gso_max_size = 15 * PAGE_SIZE;
 
        SET_NETDEV_DEV(card->dev, &card->gdev->dev);
index 2df86f5..3a4315b 100644 (file)
@@ -1206,6 +1206,7 @@ enum netdev_priv_flags {
        IFF_SUPP_NOFCS                  = 1<<19,
        IFF_LIVE_ADDR_CHANGE            = 1<<20,
        IFF_MACVLAN                     = 1<<21,
+       IFF_XMIT_DST_RELEASE_PERM       = 1<<22,
 };
 
 #define IFF_802_1Q_VLAN                        IFF_802_1Q_VLAN
@@ -1230,6 +1231,7 @@ enum netdev_priv_flags {
 #define IFF_SUPP_NOFCS                 IFF_SUPP_NOFCS
 #define IFF_LIVE_ADDR_CHANGE           IFF_LIVE_ADDR_CHANGE
 #define IFF_MACVLAN                    IFF_MACVLAN
+#define IFF_XMIT_DST_RELEASE_PERM      IFF_XMIT_DST_RELEASE_PERM
 
 /**
  *     struct net_device - The DEVICE structure.
@@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
        return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
+static inline void netif_keep_dst(struct net_device *dev)
+{
+       dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
+}
+
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
index 35a6b6b..0d441ec 100644 (file)
@@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev)
        ether_setup(dev);
 
        dev->priv_flags         |= IFF_802_1Q_VLAN;
-       dev->priv_flags         &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+       dev->priv_flags         &= ~IFF_TX_SKB_SHARING;
+       netif_keep_dst(dev);
        dev->tx_queue_len       = 0;
 
        dev->netdev_ops         = &vlan_netdev_ops;
index 1d9eaa4..17e55df 100644 (file)
@@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev)
        /* without any more elaborate queuing. 100 is a reasonable */
        /* compromise between decent burst-tolerance and protection */
        /* against memory hogs. */
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 static int clip_create(int number)
index a63b8c4..3c5bdaa 100644 (file)
@@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
        if (skb->next)
                return skb;
 
-       /* If device doesn't need skb->dst, release it right now while
-        * its hot in this cpu cache
-        */
-       if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
-               skb_dst_drop(skb);
-
        features = netif_skb_features(skb);
        skb = validate_xmit_vlan(skb, features);
        if (unlikely(!skb))
@@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */
-               if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
-                       skb_dst_force(skb);
 
                qdisc_bstats_update(q, skb);
 
@@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
                rc = NET_XMIT_SUCCESS;
        } else {
-               skb_dst_force(skb);
                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
@@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
        skb_update_prio(skb);
 
+       /* If device/qdisc don't need skb->dst, release it right now while
+        * its hot in this cpu cache.
+        */
+       if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+               skb_dst_drop(skb);
+       else
+               skb_dst_force(skb);
+
        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);
 
@@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        INIT_LIST_HEAD(&dev->adj_list.lower);
        INIT_LIST_HEAD(&dev->all_adj_list.upper);
        INIT_LIST_HEAD(&dev->all_adj_list.lower);
-       dev->priv_flags = IFF_XMIT_DST_RELEASE;
+       dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);
 
        dev->num_tx_queues = txqs;
index 0485ef1..12055fd 100644 (file)
@@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
        memcpy(dev->broadcast, &iph->daddr, 4);
 
        dev->flags              = IFF_NOARP;
-       dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        dev->addr_len           = 4;
 
        if (iph->daddr) {
index e453cb7..3e86101 100644 (file)
@@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev)
        dev->iflink             = 0;
        dev->addr_len           = 4;
        dev->features           |= NETIF_F_LLTX;
-       dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 
        return ip_tunnel_init(dev);
 }
index ea88ab3..37096d6 100644 (file)
@@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
        dev->iflink             = 0;
        dev->addr_len           = 4;
        dev->features           |= NETIF_F_LLTX;
-       dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 
        dev->features           |= IPIP_FEATURES;
        dev->hw_features        |= IPIP_FEATURES;
index 74b6779..de3b1c8 100644 (file)
@@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
        dev->flags |= IFF_NOARP;
        dev->iflink = 0;
        dev->addr_len = sizeof(struct in6_addr);
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 static int ip6gre_tunnel_init(struct net_device *dev)
index d3e8888..9409887 100644 (file)
@@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
                dev->mtu -= 8;
        dev->flags |= IFF_NOARP;
        dev->addr_len = sizeof(struct in6_addr);
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        /* This perm addr will be used as interface identifier by IPv6 */
        dev->addr_assign_type = NET_ADDR_RANDOM;
        eth_random_addr(dev->perm_addr);
index 5833a22..d440bb5 100644 (file)
@@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev)
        dev->mtu = ETH_DATA_LEN;
        dev->flags |= IFF_NOARP;
        dev->addr_len = sizeof(struct in6_addr);
-       dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 /**
index 0d4e274..6eab37c 100644 (file)
@@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
        dev->hard_header_len    = LL_MAX_HEADER + t_hlen;
        dev->mtu                = ETH_DATA_LEN - t_hlen;
        dev->flags              = IFF_NOARP;
-       dev->priv_flags        &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
        dev->iflink             = 0;
        dev->addr_len           = 4;
        dev->features           |= NETIF_F_LLTX;
index a5d2b20..4ac515f 100644 (file)
@@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
        tcf_exts_change(tp, &fnew->exts, &e);
        tcf_em_tree_change(tp, &fnew->ematches, &t);
 
+       netif_keep_dst(qdisc_dev(tp->q));
+
        if (tb[TCA_FLOW_KEYS]) {
                fnew->keymask = keymask;
                fnew->nkeys   = nkeys;
index 6f22baa..109a329 100644 (file)
@@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
                if (f->handle < f1->handle)
                        break;
 
+       netif_keep_dst(qdisc_dev(tp->q));
        rcu_assign_pointer(f->next, f1);
        rcu_assign_pointer(*fp, f);
 
index 2b349a4..38d58e6 100644 (file)
@@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops);
 
 static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
-       skb_dst_force(skb);
        q->gso_skb = skb;
        q->qstats.requeues++;
        q->q.qlen++;    /* it's still part of the queue */
@@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q)
        if (unlikely(!skb))
                return 0;
 
-       WARN_ON_ONCE(skb_dst_is_noref(skb));
-
        root_lock = qdisc_lock(q);
        dev = qdisc_dev(q);
        txq = skb_get_tx_queue(dev, skb);
index 5cd291b..6ada423 100644 (file)
@@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev)
        dev->tx_queue_len       = 100;
        dev->flags              = IFF_NOARP;
        dev->hard_header_len    = LL_MAX_HEADER;
-       dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
+       netif_keep_dst(dev);
 }
 
 static LIST_HEAD(master_dev_list);