ixgbevf: Make next_to_watch a pointer and adjust memory barriers to avoid races
author     Alexander Duyck <alexander.h.duyck@intel.com>
           Thu, 31 Jan 2013 07:43:22 +0000 (07:43 +0000)
committer  Jeff Kirsher <jeffrey.t.kirsher@intel.com>
           Fri, 8 Mar 2013 08:03:26 +0000 (00:03 -0800)
This change addresses several races that become possible because
next_to_watch can hold a value indicating that a descriptor is done
when it is not.  To correct that we instead make next_to_watch a
pointer that is set to NULL during cleanup, and set to the eop_desc
only after the descriptor rings have been written.
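
In outline, the handshake becomes (a simplified sketch, not the exact
driver code; the ring bookkeeping is omitted):

        /* xmit path: publish only after every descriptor is written */
        tx_ring->tx_buffer_info[first].next_to_watch = tx_desc; /* EOP desc */

        /* clean path: NULL means no transmit is pending for this buffer */
        eop_desc = tx_buffer_info->next_to_watch;
        if (!eop_desc)
                return;                       /* nothing queued yet */
        /* ... once DD is observed ... */
        tx_buffer_info->next_to_watch = NULL; /* consume; prevents false hangs */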

To enforce proper ordering, the next_to_watch pointer is not set until
after a wmb() that follows writing the values to the last descriptor in
a transmit.  To guarantee that the descriptor's status is not read
before the eop_desc pointer, we use read_barrier_depends(), which is
only really necessary on the Alpha architecture.
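
The pairing then looks like this (a sketch; clean_completed() is an
illustrative stand-in, the barrier placement matches the patch):

        /* writer side, ixgbevf_tx_queue() */
        /* ... fill all tx descriptors for the frame ... */
        wmb();                                /* descriptors before pointer */
        tx_ring->tx_buffer_info[first].next_to_watch = tx_desc;

        /* reader side, ixgbevf_clean_tx_irq() */
        eop_desc = tx_buffer_info->next_to_watch;
        if (eop_desc) {
                read_barrier_depends();       /* no-op except on Alpha */
                if (eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))
                        clean_completed(tx_ring); /* illustrative */
        }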

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index fc0af9a..fff0d98 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -44,8 +44,8 @@ struct ixgbevf_tx_buffer {
        struct sk_buff *skb;
        dma_addr_t dma;
        unsigned long time_stamp;
+       union ixgbe_adv_tx_desc *next_to_watch;
        u16 length;
-       u16 next_to_watch;
        u16 mapped_as_page;
 };
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index c3db6cd..20736eb 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -190,28 +190,37 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
        struct ixgbevf_adapter *adapter = q_vector->adapter;
        union ixgbe_adv_tx_desc *tx_desc, *eop_desc;
        struct ixgbevf_tx_buffer *tx_buffer_info;
-       unsigned int i, eop, count = 0;
+       unsigned int i, count = 0;
        unsigned int total_bytes = 0, total_packets = 0;
 
        if (test_bit(__IXGBEVF_DOWN, &adapter->state))
                return true;
 
        i = tx_ring->next_to_clean;
-       eop = tx_ring->tx_buffer_info[i].next_to_watch;
-       eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
+       tx_buffer_info = &tx_ring->tx_buffer_info[i];
+       eop_desc = tx_buffer_info->next_to_watch;
 
-       while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
-              (count < tx_ring->count)) {
+       do {
                bool cleaned = false;
-               rmb(); /* read buffer_info after eop_desc */
-               /* eop could change between read and DD-check */
-               if (unlikely(eop != tx_ring->tx_buffer_info[i].next_to_watch))
-                       goto cont_loop;
+
+               /* if next_to_watch is not set then there is no work pending */
+               if (!eop_desc)
+                       break;
+
+               /* prevent any other reads prior to eop_desc */
+               read_barrier_depends();
+
+               /* if DD is not set pending work has not been completed */
+               if (!(eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)))
+                       break;
+
+               /* clear next_to_watch to prevent false hangs */
+               tx_buffer_info->next_to_watch = NULL;
+
                for ( ; !cleaned; count++) {
                        struct sk_buff *skb;
                        tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
-                       tx_buffer_info = &tx_ring->tx_buffer_info[i];
-                       cleaned = (i == eop);
+                       cleaned = (tx_desc == eop_desc);
                        skb = tx_buffer_info->skb;
 
                        if (cleaned && skb) {
@@ -234,12 +243,12 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
                        i++;
                        if (i == tx_ring->count)
                                i = 0;
+
+                       tx_buffer_info = &tx_ring->tx_buffer_info[i];
                }
 
-cont_loop:
-               eop = tx_ring->tx_buffer_info[i].next_to_watch;
-               eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
-       }
+               eop_desc = tx_buffer_info->next_to_watch;
+       } while (count < tx_ring->count);
 
        tx_ring->next_to_clean = i;
 
@@ -2806,8 +2815,7 @@ static bool ixgbevf_tx_csum(struct ixgbevf_ring *tx_ring,
 }
 
 static int ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
-                         struct sk_buff *skb, u32 tx_flags,
-                         unsigned int first)
+                         struct sk_buff *skb, u32 tx_flags)
 {
        struct ixgbevf_tx_buffer *tx_buffer_info;
        unsigned int len;
@@ -2832,7 +2840,6 @@ static int ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
                                                     size, DMA_TO_DEVICE);
                if (dma_mapping_error(tx_ring->dev, tx_buffer_info->dma))
                        goto dma_error;
-               tx_buffer_info->next_to_watch = i;
 
                len -= size;
                total -= size;
@@ -2862,7 +2869,6 @@ static int ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
                                              tx_buffer_info->dma))
                                goto dma_error;
                        tx_buffer_info->mapped_as_page = true;
-                       tx_buffer_info->next_to_watch = i;
 
                        len -= size;
                        total -= size;
@@ -2881,8 +2887,6 @@ static int ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
        else
                i = i - 1;
        tx_ring->tx_buffer_info[i].skb = skb;
-       tx_ring->tx_buffer_info[first].next_to_watch = i;
-       tx_ring->tx_buffer_info[first].time_stamp = jiffies;
 
        return count;
 
@@ -2891,7 +2895,6 @@ dma_error:
 
        /* clear timestamp and dma mappings for failed tx_buffer_info map */
        tx_buffer_info->dma = 0;
-       tx_buffer_info->next_to_watch = 0;
        count--;
 
        /* clear timestamp and dma mappings for remaining portion of packet */
@@ -2908,7 +2911,8 @@ dma_error:
 }
 
 static void ixgbevf_tx_queue(struct ixgbevf_ring *tx_ring, int tx_flags,
-                            int count, u32 paylen, u8 hdr_len)
+                            int count, unsigned int first, u32 paylen,
+                            u8 hdr_len)
 {
        union ixgbe_adv_tx_desc *tx_desc = NULL;
        struct ixgbevf_tx_buffer *tx_buffer_info;
@@ -2959,6 +2963,16 @@ static void ixgbevf_tx_queue(struct ixgbevf_ring *tx_ring, int tx_flags,
 
        tx_desc->read.cmd_type_len |= cpu_to_le32(txd_cmd);
 
+       tx_ring->tx_buffer_info[first].time_stamp = jiffies;
+
+       /* Force memory writes to complete before letting h/w
+        * know there are new descriptors to fetch.  (Only
+        * applicable for weak-ordered memory model archs,
+        * such as IA-64).
+        */
+       wmb();
+
+       tx_ring->tx_buffer_info[first].next_to_watch = tx_desc;
        tx_ring->next_to_use = i;
 }
 
@@ -3050,15 +3064,8 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
                tx_flags |= IXGBE_TX_FLAGS_CSUM;
 
        ixgbevf_tx_queue(tx_ring, tx_flags,
-                        ixgbevf_tx_map(tx_ring, skb, tx_flags, first),
-                        skb->len, hdr_len);
-       /*
-        * Force memory writes to complete before letting h/w
-        * know there are new descriptors to fetch.  (Only
-        * applicable for weak-ordered memory model archs,
-        * such as IA-64).
-        */
-       wmb();
+                        ixgbevf_tx_map(tx_ring, skb, tx_flags),
+                        first, skb->len, hdr_len);
 
        writel(tx_ring->next_to_use, adapter->hw.hw_addr + tx_ring->tail);
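
For reference, the xmit-side ordering after this patch, condensed from
the hunks above:

        /* 1. ixgbevf_tx_map()   - DMA-map the skb, fill tx_buffer_info  */
        /* 2. ixgbevf_tx_queue() - write all ring descriptors            */
        /* 3. wmb()              - make the descriptors globally visible */
        /* 4. next_to_watch = tx_desc; next_to_use = i                   */
        /* 5. writel(next_to_use, tail) - tell hardware to fetch         */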