net: allow skb->head to be a page fragment
[pandora-kernel.git] net/core/skbuff.c
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e598400..effa75d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
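
Until now skb->head always came from kmalloc(). This change lets a driver hand build_skb() a page fragment instead: the new frag_size parameter is recorded in a head_frag bit so the free paths (via skb_free_head(), added below) know whether to kfree() the head or put_page() the page it lives in. A hypothetical caller, to make the contract concrete; the names (rx_page, rx_frag_to_skb) are invented, headroom and DMA handling are elided, and the caller must guarantee the fragment really spans frag_size bytes, with room for skb_shared_info at the end:

    static struct sk_buff *rx_frag_to_skb(struct page *rx_page,
                                          unsigned int offset,
                                          unsigned int len)
    {
            unsigned int frag_size = SKB_DATA_ALIGN(len) +
                            SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
            void *data = page_address(rx_page) + offset;
            struct sk_buff *skb;

            get_page(rx_page);      /* reference now owned by the skb head */
            skb = build_skb(data, frag_size);
            if (!skb) {
                    put_page(rx_page);
                    return NULL;
            }
            skb_put(skb, len);      /* head freed via put_page(), not kfree() */
            return skb;
    }
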
@@ -67,8 +67,7 @@
 
 #include <asm/uaccess.h>
 #include <trace/events/skb.h>
-
-#include "kmap_skb.h"
+#include <linux/highmem.h>
 
 static struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
@@ -246,6 +245,7 @@ EXPORT_SYMBOL(__alloc_skb);
 /**
  * build_skb - build a network buffer
  * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
  * skb_shared_info. @data must have been allocated by kmalloc()
@@ -259,20 +259,21 @@ EXPORT_SYMBOL(__alloc_skb);
  *  before giving packet to stack.
  *  RX rings only contain data buffers, not full skbs.
  */
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
 {
        struct skb_shared_info *shinfo;
        struct sk_buff *skb;
-       unsigned int size;
+       unsigned int size = frag_size ? : ksize(data);
 
        skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
        if (!skb)
                return NULL;
 
-       size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
        memset(skb, 0, offsetof(struct sk_buff, tail));
        skb->truesize = SKB_TRUESIZE(size);
+       skb->head_frag = frag_size != 0;
        atomic_set(&skb->users, 1);
        skb->head = data;
        skb->data = data;
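
Two details in the new function body deserve a note. First, frag_size ? : ksize(data) is the GNU "elvis" extension and expands to the line below, so kmalloc()ed heads keep their old ksize()-based sizing while fragment heads use the caller-declared size; ksize() is only meaningful for kmalloc() pointers. That size also feeds SKB_TRUESIZE(), keeping socket memory accounting honest for fragments. Second, head_frag merely records which flavour the head is, for the benefit of the free path.

    unsigned int size = frag_size ? frag_size : ksize(data);
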
@@ -377,6 +378,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
                skb_get(list);
 }
 
+static void skb_free_head(struct sk_buff *skb)
+{
+       if (skb->head_frag)
+               put_page(virt_to_head_page(skb->head));
+       else
+               kfree(skb->head);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
        if (!skb->cloned ||
@@ -403,7 +412,7 @@ static void skb_release_data(struct sk_buff *skb)
                if (skb_has_frag_list(skb))
                        skb_drop_fraglist(skb);
 
-               kfree(skb->head);
+               skb_free_head(skb);
        }
 }
 
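
skb_free_head() is now the single dispatch point between the two head flavours. virt_to_head_page() deliberately resolves to the head page, so a head carved from anywhere inside a (possibly compound) page drops a reference on the right struct page. A minimal lifetime sketch, with invented names, 4KB pages assumed and error handling elided:

    static void head_page_sharing_sketch(void)
    {
            struct page *p = alloc_page(GFP_ATOMIC);
            struct sk_buff *a, *b;

            if (!p)
                    return;
            get_page(p);            /* second fragment's reference */
            a = build_skb(page_address(p), 2048);
            b = build_skb(page_address(p) + 2048, 2048);

            kfree_skb(a);   /* skb_free_head() -> put_page(): page still live */
            kfree_skb(b);   /* last reference dropped: page freed */
    }
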
@@ -645,6 +654,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
        C(tail);
        C(end);
        C(head);
+       C(head_frag);
        C(data);
        C(truesize);
        atomic_set(&n->users, 1);
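
Clones share skb->head and free it from skb_release_data() only once the shared dataref count reaches zero, so whichever clone dies last must still know how the head was allocated; head_frag therefore travels with head, data and end in the C() copy list.
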
@@ -707,10 +717,10 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
                        }
                        return -ENOMEM;
                }
-               vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+               vaddr = kmap_atomic(skb_frag_page(f));
                memcpy(page_address(page),
                       vaddr + f->page_offset, skb_frag_size(f));
-               kunmap_skb_frag(vaddr);
+               kunmap_atomic(vaddr);
                page->private = (unsigned long)head;
                head = page;
        }
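
This hunk and the similar ones below retire the kmap_skb_frag()/kunmap_skb_frag() helpers from the now-deleted "kmap_skb.h" (hence the <linux/highmem.h> include at the top of the file). With the km_type slot argument gone from kmap_atomic(), the helpers reduce to one-liners and open-coding them is clearer. Roughly, as a sketch rather than the verbatim header contents:

    static inline void *kmap_skb_frag(const skb_frag_t *frag)
    {
            return kmap_atomic(skb_frag_page(frag));
    }

    static inline void kunmap_skb_frag(void *vaddr)
    {
            kunmap_atomic(vaddr);
    }
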
@@ -941,7 +951,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
        }
 
-       if (fastpath &&
+       if (fastpath && !skb->head_frag &&
            size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
                memmove(skb->head + size, skb_shinfo(skb),
                        offsetof(struct skb_shared_info,
@@ -968,7 +978,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
        if (fastpath) {
-               kfree(skb->head);
+               skb_free_head(skb);
        } else {
                /* copy this zero copy skb frags */
                if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
@@ -986,6 +996,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
        off = (data + nhead) - skb->head;
 
        skb->head     = data;
+       skb->head_frag = 0;
 adjust_others:
        skb->data    += off;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
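
pskb_expand_head() needs three touches. The in-place fastpath is skipped for fragment heads because its ksize() probe is only defined for kmalloc() pointers; the fastpath free goes through skb_free_head(); and once the head has been reallocated the flag is cleared:

    /*
     * Post-condition: whatever flavour the head had on entry, after a
     * successful non-fastpath pskb_expand_head() it is plain kmalloc()
     * memory again, so ksize() and kfree() apply to it once more.
     */
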
@@ -1275,7 +1286,7 @@ drop_pages:
                                return -ENOMEM;
 
                        nfrag->next = frag->next;
-                       kfree_skb(frag);
+                       consume_skb(frag);
                        frag = nfrag;
                        *fragp = frag;
                }
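
A small, separate correctness tweak rides along here: the replaced fraglist skb is released with consume_skb() instead of kfree_skb(). Both free the skb identically; they differ only in the tracepoint fired, which drop-monitoring tools key off, and trimming a fraglist is normal consumption, not a drop:

    /*
     * kfree_skb()   -> trace_kfree_skb():   packet was dropped
     * consume_skb() -> trace_consume_skb(): packet was consumed
     * Same free; different signal to tools such as dropwatch.
     */
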
@@ -1487,21 +1498,22 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
+               skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 
                WARN_ON(start > offset + len);
 
-               end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+               end = start + skb_frag_size(f);
                if ((copy = end - offset) > 0) {
                        u8 *vaddr;
 
                        if (copy > len)
                                copy = len;
 
-                       vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+                       vaddr = kmap_atomic(skb_frag_page(f));
                        memcpy(to,
-                              vaddr + skb_shinfo(skb)->frags[i].page_offset+
-                              offset - start, copy);
-                       kunmap_skb_frag(vaddr);
+                              vaddr + f->page_offset + offset - start,
+                              copy);
+                       kunmap_atomic(vaddr);
 
                        if ((len -= copy) == 0)
                                return 0;
@@ -1547,9 +1559,9 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
        put_page(spd->pages[i]);
 }
 
-static inline struct page *linear_to_page(struct page *page, unsigned int *len,
-                                         unsigned int *offset,
-                                         struct sk_buff *skb, struct sock *sk)
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+                                  unsigned int *offset,
+                                  struct sk_buff *skb, struct sock *sk)
 {
        struct page *p = sk->sk_sndmsg_page;
        unsigned int off;
@@ -1565,6 +1577,9 @@ new_page:
        } else {
                unsigned int mlen;
 
+               /* If we are the only user of the page, we can reset offset */
+               if (page_count(p) == 1)
+                       sk->sk_sndmsg_off = 0;
                off = sk->sk_sndmsg_off;
                mlen = PAGE_SIZE - off;
                if (mlen < 64 && mlen < *len) {
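
linear_to_page() copies linear skb bytes into a per-socket scratch page (sk->sk_sndmsg_page) so splice can take page references on them, with sk_sndmsg_off tracking how much of the page is spoken for. A page_count() of 1 means every reference handed to the pipe has since been dropped and the socket holds the only remaining one, so the offset can safely rewind and the whole page be reused rather than abandoned once full. Note also that the get_page() formerly done here (removed just below) moves into spd_fill_page(), so a reference is taken once per spd entry rather than once per copied chunk.
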
@@ -1578,36 +1593,48 @@ new_page:
        memcpy(page_address(p) + off, page_address(page) + *offset, *len);
        sk->sk_sndmsg_off += *len;
        *offset = off;
-       get_page(p);
 
        return p;
 }
 
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+                            struct page *page,
+                            unsigned int offset)
+{
+       return  spd->nr_pages &&
+               spd->pages[spd->nr_pages - 1] == page &&
+               (spd->partial[spd->nr_pages - 1].offset +
+                spd->partial[spd->nr_pages - 1].len == offset);
+}
+
 /*
  * Fill page/offset/length into spd, if it can hold more pages.
  */
-static inline int spd_fill_page(struct splice_pipe_desc *spd,
-                               struct pipe_inode_info *pipe, struct page *page,
-                               unsigned int *len, unsigned int offset,
-                               struct sk_buff *skb, int linear,
-                               struct sock *sk)
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+                         struct pipe_inode_info *pipe, struct page *page,
+                         unsigned int *len, unsigned int offset,
+                         struct sk_buff *skb, bool linear,
+                         struct sock *sk)
 {
-       if (unlikely(spd->nr_pages == pipe->buffers))
-               return 1;
+       if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+               return true;
 
        if (linear) {
                page = linear_to_page(page, len, &offset, skb, sk);
                if (!page)
-                       return 1;
-       } else
-               get_page(page);
-
+                       return true;
+       }
+       if (spd_can_coalesce(spd, page, offset)) {
+               spd->partial[spd->nr_pages - 1].len += *len;
+               return false;
+       }
+       get_page(page);
        spd->pages[spd->nr_pages] = page;
        spd->partial[spd->nr_pages].len = *len;
        spd->partial[spd->nr_pages].offset = offset;
        spd->nr_pages++;
 
-       return 0;
+       return false;
 }
 
 static inline void __segment_seek(struct page **page, unsigned int *poff,
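
spd_can_coalesce() lets a segment that lands immediately after the previous one, in the same page, extend the existing partial entry instead of consuming a fresh pipe buffer; this matters now that linear data is copied into a shared scratch page in small chunks. get_page() is only reached for non-coalesced entries, preserving one page reference per spd slot. A worked example of the effect:

    /*
     * Splicing 4096 bytes that arrive as four adjacent 1024-byte
     * chunks of one page:
     *
     *   fill(page, off=0,    len=1024) -> partial[0] = {0, 1024}
     *   fill(page, off=1024, len=1024) -> coalesced: {0, 2048}
     *   fill(page, off=2048, len=1024) -> coalesced: {0, 3072}
     *   fill(page, off=3072, len=1024) -> coalesced: {0, 4096}
     *
     * One pipe slot and one page reference instead of four of each.
     */
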
@@ -1624,20 +1651,20 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
        *plen -= off;
 }
 
-static inline int __splice_segment(struct page *page, unsigned int poff,
-                                  unsigned int plen, unsigned int *off,
-                                  unsigned int *len, struct sk_buff *skb,
-                                  struct splice_pipe_desc *spd, int linear,
-                                  struct sock *sk,
-                                  struct pipe_inode_info *pipe)
+static bool __splice_segment(struct page *page, unsigned int poff,
+                            unsigned int plen, unsigned int *off,
+                            unsigned int *len, struct sk_buff *skb,
+                            struct splice_pipe_desc *spd, bool linear,
+                            struct sock *sk,
+                            struct pipe_inode_info *pipe)
 {
        if (!*len)
-               return 1;
+               return true;
 
        /* skip this segment if already processed */
        if (*off >= plen) {
                *off -= plen;
-               return 0;
+               return false;
        }
 
        /* ignore any bits we already processed */
@@ -1653,23 +1680,23 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
                flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
                if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
-                       return 1;
+                       return true;
 
                __segment_seek(&page, &poff, &plen, flen);
                *len -= flen;
 
        } while (*len && plen);
 
-       return 0;
+       return false;
 }
 
 /*
- * Map linear and fragment data from the skb to spd. It reports failure if the
+ * Map linear and fragment data from the skb to spd. It reports true if the
  * pipe is full or if we already spliced the requested length.
  */
-static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
-                            unsigned int *offset, unsigned int *len,
-                            struct splice_pipe_desc *spd, struct sock *sk)
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+                             unsigned int *offset, unsigned int *len,
+                             struct splice_pipe_desc *spd, struct sock *sk)
 {
        int seg;
 
@@ -1679,8 +1706,8 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
        if (__splice_segment(virt_to_page(skb->data),
                             (unsigned long) skb->data & (PAGE_SIZE - 1),
                             skb_headlen(skb),
-                            offset, len, skb, spd, 1, sk, pipe))
-               return 1;
+                            offset, len, skb, spd, true, sk, pipe))
+               return true;
 
        /*
         * then map the fragments
@@ -1690,11 +1717,11 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 
                if (__splice_segment(skb_frag_page(f),
                                     f->page_offset, skb_frag_size(f),
-                                    offset, len, skb, spd, 0, sk, pipe))
-                       return 1;
+                                    offset, len, skb, spd, false, sk, pipe))
+                       return true;
        }
 
-       return 0;
+       return false;
 }
 
 /*
@@ -1707,8 +1734,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int tlen,
                    unsigned int flags)
 {
-       struct partial_page partial[PIPE_DEF_BUFFERS];
-       struct page *pages[PIPE_DEF_BUFFERS];
+       struct partial_page partial[MAX_SKB_FRAGS];
+       struct page *pages[MAX_SKB_FRAGS];
        struct splice_pipe_desc spd = {
                .pages = pages,
                .partial = partial,
@@ -1720,9 +1747,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
        struct sock *sk = skb->sk;
        int ret = 0;
 
-       if (splice_grow_spd(pipe, &spd))
-               return -ENOMEM;
-
        /*
         * __skb_splice_bits() only fails if the output has no room left,
         * so no point in going over the frag_list for the error case.
@@ -1758,7 +1782,6 @@ done:
                lock_sock(sk);
        }
 
-       splice_shrink_spd(pipe, &spd);
        return ret;
 }
 
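
With coalescing in place, skb_splice_bits() can size its on-stack page and partial arrays at MAX_SKB_FRAGS instead of PIPE_DEF_BUFFERS and drop the splice_grow_spd()/splice_shrink_spd() round-trip, taking its -ENOMEM failure path with it. The bound is enforced, not assumed: spd_fill_page() (changed above) reports the spd full once nr_pages reaches MAX_SKB_FRAGS, so the fixed arrays cannot overflow however many buffers the pipe has.
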
@@ -1806,10 +1829,10 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
                        if (copy > len)
                                copy = len;
 
-                       vaddr = kmap_skb_frag(frag);
+                       vaddr = kmap_atomic(skb_frag_page(frag));
                        memcpy(vaddr + frag->page_offset + offset - start,
                               from, copy);
-                       kunmap_skb_frag(vaddr);
+                       kunmap_atomic(vaddr);
 
                        if ((len -= copy) == 0)
                                return 0;
@@ -1869,21 +1892,21 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
 
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
                WARN_ON(start > offset + len);
 
-               end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+               end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        u8 *vaddr;
-                       skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
                        if (copy > len)
                                copy = len;
-                       vaddr = kmap_skb_frag(frag);
+                       vaddr = kmap_atomic(skb_frag_page(frag));
                        csum2 = csum_partial(vaddr + frag->page_offset +
                                             offset - start, copy, 0);
-                       kunmap_skb_frag(vaddr);
+                       kunmap_atomic(vaddr);
                        csum = csum_block_add(csum, csum2, pos);
                        if (!(len -= copy))
                                return csum;
@@ -1955,12 +1978,12 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 
                        if (copy > len)
                                copy = len;
-                       vaddr = kmap_skb_frag(frag);
+                       vaddr = kmap_atomic(skb_frag_page(frag));
                        csum2 = csum_partial_copy_nocheck(vaddr +
                                                          frag->page_offset +
                                                          offset - start, to,
                                                          copy, 0);
-                       kunmap_skb_frag(vaddr);
+                       kunmap_atomic(vaddr);
                        csum = csum_block_add(csum, csum2, pos);
                        if (!(len -= copy))
                                return csum;
@@ -2480,7 +2503,7 @@ next_skb:
 
                if (abs_offset < block_limit) {
                        if (!st->frag_data)
-                               st->frag_data = kmap_skb_frag(frag);
+                               st->frag_data = kmap_atomic(skb_frag_page(frag));
 
                        *data = (u8 *) st->frag_data + frag->page_offset +
                                (abs_offset - st->stepped_offset);
@@ -2489,7 +2512,7 @@ next_skb:
                }
 
                if (st->frag_data) {
-                       kunmap_skb_frag(st->frag_data);
+                       kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }
 
@@ -2498,7 +2521,7 @@ next_skb:
        }
 
        if (st->frag_data) {
-               kunmap_skb_frag(st->frag_data);
+               kunmap_atomic(st->frag_data);
                st->frag_data = NULL;
        }
 
@@ -2526,7 +2549,7 @@ EXPORT_SYMBOL(skb_seq_read);
 void skb_abort_seq_read(struct skb_seq_state *st)
 {
        if (st->frag_data)
-               kunmap_skb_frag(st->frag_data);
+               kunmap_atomic(st->frag_data);
 }
 EXPORT_SYMBOL(skb_abort_seq_read);
 
@@ -3166,7 +3189,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
        int len = skb->len;
 
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-           (unsigned)sk->sk_rcvbuf)
+           (unsigned int)sk->sk_rcvbuf)
                return -ENOMEM;
 
        skb_orphan(skb);