When using bnx2 in a high transmit load, bnx2_tx_int() cost is pretty high.
There are two reasons.
One is an expensive call to bnx2_get_hw_tx_cons(bnapi) for each freed skb
One is cpu stalls when accessing skb_is_gso(skb) / skb_shinfo(skb)->nr_frags
because of two cache line misses.
(One to get skb->end/head to compute skb_shinfo(skb),
one to get is_gso/nr_frags)
This patch :
1) avoids calling bnx2_get_hw_tx_cons(bnapi) too many times.
2) makes bnx2_start_xmit() cache is_gso & nr_frags into sw_tx_bd descriptor.
This uses a litle bit more ram (256 longs per device on x86), but helps a lot.
3) uses a prefetch(&skb->end) to speedup dev_kfree_skb(), bringing
cache line that will be needed in skb_release_data()
result is 5 % bandwidth increase in benchmarks, involving UDP or TCP receive
& transmits, when a cpu is dedicated to ksoftirqd for bnx2.
bnx2_tx_int going from 3.33 % cpu to 0.5 % cpu in oprofile
Note : skb_dma_unmap() still very expensive but this is for another patch,
not related to bnx2 (2.9 % of cpu, while it does nothing on x86_32)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
tx_buf = &txr->tx_buf_ring[sw_ring_cons];
skb = tx_buf->skb;
tx_buf = &txr->tx_buf_ring[sw_ring_cons];
skb = tx_buf->skb;
+ /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */
+ prefetch(&skb->end);
+
/* partial BD completions possible with TSO packets */
/* partial BD completions possible with TSO packets */
u16 last_idx, last_ring_idx;
u16 last_idx, last_ring_idx;
- last_idx = sw_cons +
- skb_shinfo(skb)->nr_frags + 1;
- last_ring_idx = sw_ring_cons +
- skb_shinfo(skb)->nr_frags + 1;
+ last_idx = sw_cons + tx_buf->nr_frags + 1;
+ last_ring_idx = sw_ring_cons + tx_buf->nr_frags + 1;
if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) {
last_idx++;
}
if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) {
last_idx++;
}
skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE);
tx_buf->skb = NULL;
skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE);
tx_buf->skb = NULL;
- last = skb_shinfo(skb)->nr_frags;
+ last = tx_buf->nr_frags;
for (i = 0; i < last; i++) {
sw_cons = NEXT_TX_BD(sw_cons);
for (i = 0; i < last; i++) {
sw_cons = NEXT_TX_BD(sw_cons);
if (tx_pkt == budget)
break;
if (tx_pkt == budget)
break;
- hw_cons = bnx2_get_hw_tx_cons(bnapi);
+ if (hw_cons == sw_cons)
+ hw_cons = bnx2_get_hw_tx_cons(bnapi);
}
txr->hw_tx_cons = hw_cons;
}
txr->hw_tx_cons = hw_cons;
txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START;
last_frag = skb_shinfo(skb)->nr_frags;
txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START;
last_frag = skb_shinfo(skb)->nr_frags;
+ tx_buf->nr_frags = last_frag;
+ tx_buf->is_gso = skb_is_gso(skb);
for (i = 0; i < last_frag; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
for (i = 0; i < last_frag; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
struct sw_tx_bd {
struct sk_buff *skb;
struct sw_tx_bd {
struct sk_buff *skb;
+ unsigned short is_gso;
+ unsigned short nr_frags;
};
#define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT)
};
#define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT)