skb->head is currently allocated from kmalloc(). This is convenient but
has the drawback the data cannot be converted to a page fragment if
needed.
We have three spots were it hurts :
1) GRO aggregation
When a linear skb must be appended to another skb, GRO uses the
frag_list fallback, very inefficient since we keep all struct sk_buff
around. So drivers enabling GRO but delivering linear skbs to network
stack aren't enabling full GRO power.
2) splice(socket -> pipe).
We must copy the linear part to a page fragment.
This kind of defeats splice() purpose (zero copy claim)
3) TCP coalescing.
Recently introduced, this permits to group several contiguous segments
into a single skb. This shortens queue lengths and save kernel memory,
and greatly reduce probabilities of TCP collapses. This coalescing
doesnt work on linear skbs (or we would need to copy data, this would be
too slow)
Given all these issues, the following patch introduces the possibility
of having skb->head be a fragment in itself. We use a new skb flag,
skb->head_frag to carry this information.
build_skb() is changed to accept a frag_size argument. Drivers willing
to provide a page fragment instead of kmalloc() data will set a non zero
value, set to the fragment size.
Then, on situations we need to convert the skb head to a frag in itself,
we can check if skb->head_frag is set and avoid the copies or various
fallbacks we have.
This means drivers currently using frags could be updated to avoid the
current skb->head allocation and reduce their memory footprint (aka skb
truesize). (thats 512 or 1024 bytes saved per skb). This also makes
bpf/netfilter faster since the 'first frag' will be part of skb linear
part, no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
PCI_DMA_FROMDEVICE);
dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
PCI_DMA_FROMDEVICE);
+ skb = build_skb(data, 0);
if (!skb) {
kfree(data);
goto error;
if (!skb) {
kfree(data);
goto error;
dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size, DMA_FROM_DEVICE);
if (likely(new_data))
dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size, DMA_FROM_DEVICE);
if (likely(new_data))
+ skb = build_skb(data, 0);
if (likely(skb)) {
#ifdef BNX2X_STOP_ON_ERROR
if (likely(skb)) {
#ifdef BNX2X_STOP_ON_ERROR
dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size,
DMA_FROM_DEVICE);
dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size,
DMA_FROM_DEVICE);
+ skb = build_skb(data, 0);
if (unlikely(!skb)) {
kfree(data);
fp->eth_q_stats.rx_skb_alloc_failed++;
if (unlikely(!skb)) {
kfree(data);
fp->eth_q_stats.rx_skb_alloc_failed++;
pci_unmap_single(tp->pdev, dma_addr, skb_size,
PCI_DMA_FROMDEVICE);
pci_unmap_single(tp->pdev, dma_addr, skb_size,
PCI_DMA_FROMDEVICE);
+ skb = build_skb(data, 0);
if (!skb) {
kfree(data);
goto drop_it_no_recycle;
if (!skb) {
kfree(data);
goto drop_it_no_recycle;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
- /* 9/11 bit hole (depending on ndisc_nodetype presence) */
+ __u8 head_frag:1;
+ /* 8/10 bit hole (depending on ndisc_nodetype presence) */
kmemcheck_bitfield_end(flags2);
#ifdef CONFIG_NET_DMA
kmemcheck_bitfield_end(flags2);
#ifdef CONFIG_NET_DMA
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
gfp_t priority, int fclone, int node);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
gfp_t priority, int fclone, int node);
-extern struct sk_buff *build_skb(void *data);
+extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
/**
* build_skb - build a network buffer
* @data: data buffer provided by caller
/**
* build_skb - build a network buffer
* @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc()
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc()
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
+ unsigned int size = frag_size ? : ksize(data);
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
- size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
C(tail);
C(end);
C(head);
C(tail);
C(end);
C(head);
C(data);
C(truesize);
atomic_set(&n->users, 1);
C(data);
C(truesize);
atomic_set(&n->users, 1);
fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
}
fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
}
+ if (fastpath && !skb->head_frag &&
size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
memmove(skb->head + size, skb_shinfo(skb),
offsetof(struct skb_shared_info,
size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
memmove(skb->head + size, skb_shinfo(skb),
offsetof(struct skb_shared_info,
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
if (fastpath) {
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
if (fastpath) {
} else {
/* copy this zero copy skb frags */
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
} else {
/* copy this zero copy skb frags */
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
off = (data + nhead) - skb->head;
skb->head = data;
off = (data + nhead) - skb->head;
skb->head = data;
adjust_others:
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
adjust_others:
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET