X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=net%2Fcore%2Fskbuff.c;h=6ef7008a3ef363edc6df47387ce92910d04f56d5;hb=1483b8744e1a189b2018e2a1bd82f343d6bb52d2;hp=b6283779e93d4432f9f9cad9f4c7a1610b3d1763;hpb=8df042e85c4462f97be286ef0fd82a627f27e9c8;p=pandora-kernel.git

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b6283779e93d..6ef7008a3ef3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -52,6 +52,7 @@
 #endif
 #include <linux/string.h>
 #include <linux/skbuff.h>
+#include <linux/splice.h>
 #include <linux/cache.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
@@ -71,6 +72,40 @@
 static struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
+static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	struct sk_buff *skb = (struct sk_buff *) buf->private;
+
+	kfree_skb(skb);	/* drop the skb reference kept in buf->private */
+}
+
+static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+				struct pipe_buffer *buf)
+{
+	struct sk_buff *skb = (struct sk_buff *) buf->private;
+
+	skb_get(skb);	/* take one more reference on the skb in buf->private */
+}
+
+static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	return 1;	/* NOTE(review): non-zero presumably means "not stolen" - confirm */
+}
+
+
+/* Pipe buffer operations for pipe buffers whose ->private holds an skb. */
+static struct pipe_buf_operations sock_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = sock_pipe_buf_release,
+	.steal = sock_pipe_buf_steal,
+	.get = sock_pipe_buf_get,
+};
+
 /*
  *	Keep out-of-line to prevent kernel bloat.
  *	__builtin_return_address is not used because it is not always
@@ -228,6 +263,28 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 	return skb;
 }
 
+/**
+ *	dev_alloc_skb - allocate an skbuff for receiving
+ *	@length: length to allocate
+ *
+ *	Allocate a new &sk_buff and assign it a usage count of one. The
+ *	buffer has unspecified headroom built in. Users should allocate
+ *	the headroom they think they need without accounting for the
+ *	built in space. 
The built in space is used for optimisations.
+ *
+ *	%NULL is returned if there is no free memory. Although this function
+ *	allocates memory it can be called from an interrupt.
+ */
+struct sk_buff *dev_alloc_skb(unsigned int length)
+{
+	/*
+	 * There is more code here than it seems:
+	 * __dev_alloc_skb is an inline
+	 */
+	return __dev_alloc_skb(length, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(dev_alloc_skb);
+
 static void skb_drop_list(struct sk_buff **listp)
 {
 	struct sk_buff *list = *listp;
@@ -822,6 +879,78 @@ free_skb:
 	return err;
 }
 
+/**
+ *	skb_put - add data to a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to add
+ *
+ *	This function extends the used data area of the buffer. If this would
+ *	exceed the total buffer size the kernel will panic. A pointer to the
+ *	first byte of the extra data is returned.
+ */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+{
+	unsigned char *tmp = skb_tail_pointer(skb);	/* old tail = start of new area */
+	SKB_LINEAR_ASSERT(skb);
+	skb->tail += len;
+	skb->len  += len;
+	if (unlikely(skb->tail > skb->end))
+		skb_over_panic(skb, len, __builtin_return_address(0));
+	return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ *	skb_push - add data to the start of a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to add
+ *
+ *	This function extends the used data area of the buffer at the buffer
+ *	start. If this would exceed the total buffer headroom the kernel will
+ *	panic. A pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+{
+	skb->data -= len;
+	skb->len  += len;
+	if (unlikely(skb->data<skb->head))	/* pushed past the start of the headroom */
+		skb_under_panic(skb, len, __builtin_return_address(0));
+	return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ *	skb_pull - remove data from the start of a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to remove
+ *
+ *	This function removes data from the start of a buffer, returning
+ *	the memory to the headroom. 
A pointer to the next data in the buffer
+ *	is returned. Once the data has been pulled future pushes will overwrite
+ *	the old data. %NULL is returned if @len exceeds the buffer length.
+ */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+	return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ *	skb_trim - remove end from a buffer
+ *	@skb: buffer to alter
+ *	@len: new length
+ *
+ *	Cut the length of a buffer down by removing data from the tail. If
+ *	the buffer is already at or under the length specified it is not
+ *	modified. The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+	if (skb->len > len)
+		__skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
 /* Trims skb to length len. It can change skb pointers.
  */
 
@@ -1122,6 +1251,217 @@ fault:
 	return -EFAULT;
 }
 
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
+
+	kfree_skb(skb);	/* drop the reference stored for slot i */
+}
+
+/* Fill page/offset/length into spd, if it can hold more pages. Stores an
+ * skb reference per filled slot. Returns 0 on success, 1 if spd is full.
+ */
+static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+				unsigned int len, unsigned int offset,
+				struct sk_buff *skb)
+{
+	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+		return 1;	/* no free slot left in spd */
+
+	spd->pages[spd->nr_pages] = page;
+	spd->partial[spd->nr_pages].len = len;
+	spd->partial[spd->nr_pages].offset = offset;
+	spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
+	spd->nr_pages++;
+	return 0;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. Returns 0 if any
+ * page was added to spd, 1 otherwise. 
+ */
+static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
+			     unsigned int *total_len,
+			     struct splice_pipe_desc *spd)
+{
+	unsigned int nr_pages = spd->nr_pages;	/* spd fill level on entry */
+	unsigned int poff, plen, len, toff, tlen;
+	int headlen, seg;
+
+	toff = *offset;
+	tlen = *total_len;
+	if (!tlen)
+		goto err;	/* nothing requested */
+
+	/*
+	 * if the offset lies at or beyond the end of the linear part, go
+	 * directly to the fragments.
+	 */
+	headlen = skb_headlen(skb);
+	if (toff >= headlen) {
+		toff -= headlen;
+		goto map_frag;
+	}
+
+	/*
+	 * first map the linear region into the pages/partial map, skipping
+	 * any potential initial offset.
+	 */
+	len = 0;
+	while (len < headlen) {
+		void *p = skb->data + len;
+
+		poff = (unsigned long) p & (PAGE_SIZE - 1);	/* offset of p within its page */
+		plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff);
+		len += plen;
+
+		if (toff) {
+			if (plen <= toff) {
+				toff -= plen;
+				continue;	/* chunk lies wholly before the offset */
+			}
+			plen -= toff;
+			poff += toff;
+			toff = 0;
+		}
+
+		plen = min(plen, tlen);
+		if (!plen)
+			break;
+
+		/*
+		 * just jump directly to update and return, no point
+		 * in going over fragments when the output is full.
+		 */
+		if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb))
+			goto done;
+
+		tlen -= plen;
+	}
+
+	/*
+	 * then map the fragments
+	 */
+map_frag:
+	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+		plen = f->size;
+		poff = f->page_offset;
+
+		if (toff) {
+			if (plen <= toff) {
+				toff -= plen;
+				continue;	/* fragment lies wholly before the offset */
+			}
+			plen -= toff;
+			poff += toff;
+			toff = 0;
+		}
+
+		plen = min(plen, tlen);
+		if (!plen)
+			break;
+
+		if (spd_fill_page(spd, f->page, plen, poff, skb))
+			break;
+
+		tlen -= plen;
+	}
+
+done:
+	if (spd->nr_pages - nr_pages) {
+		*offset = 0;		/* the initial offset has been consumed */
+		*total_len = tlen;	/* bytes still wanted by the caller */
+		return 0;
+	}
+err:
+	return 1;	/* this call added no page to spd */
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. 
We'd probably need to recurse to
+ * handle that cleanly. Returns -ENOMEM, 0, or splice_to_pipe()'s result.
+ */
+int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
+		    struct pipe_inode_info *pipe, unsigned int tlen,
+		    unsigned int flags)
+{
+	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &sock_pipe_buf_ops,
+		.spd_release = sock_spd_release,
+	};
+	struct sk_buff *skb;
+
+	/*
+	 * I'd love to avoid the clone here, but tcp_read_sock()
+	 * ignores reference counts and unconditionally kills the sk_buff
+	 * on return from the actor.
+	 */
+	skb = skb_clone(__skb, GFP_KERNEL);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	/*
+	 * __skb_splice_bits() only fails if the output has no room left,
+	 * so no point in going over the frag_list for the error case.
+	 */
+	if (__skb_splice_bits(skb, &offset, &tlen, &spd))
+		goto done;
+	else if (!tlen)
+		goto done;	/* the whole request is already mapped */
+
+	/*
+	 * now see if we have a frag_list to map
+	 */
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list && tlen; list = list->next) {
+			if (__skb_splice_bits(list, &offset, &tlen, &spd))
+				break;
+		}
+	}
+
+done:
+	/*
+	 * drop our reference to the clone, the pipe consumption will
+	 * drop the rest.
+	 */
+	kfree_skb(skb);
+
+	if (spd.nr_pages) {
+		int ret;
+
+		/*
+		 * Drop the socket lock, otherwise we have reverse
+		 * locking dependencies between sk_lock and i_mutex
+		 * here as compared to sendfile(). We enter here
+		 * with the socket lock held, and splice_to_pipe() will
+		 * grab the pipe inode lock. For sendfile() emulation,
+		 * we call into ->sendpage() with the i_mutex lock held
+		 * and networking will grab the socket lock.
+		 */
+		release_sock(__skb->sk);
+		ret = splice_to_pipe(pipe, &spd);
+		lock_sock(__skb->sk);
+		return ret;
+	}
+
+	return 0;	/* no page was mapped into spd */
+}
+
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
  *	@skb: destination buffer
@@ -1661,11 +2001,11 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
  * of bytes already consumed and the next call to
  * skb_seq_read() will return the remaining part of the block.
  *
- * Note: The size of each block of data returned can be arbitary,
+ * Note 1: The size of each block of data returned can be arbitrary,
  *       this limitation is the cost for zerocopy seqeuental
  *       reads of potentially non linear data.
  *
- * Note: Fragment lists within fragments are not implemented
+ * Note 2: Fragment lists within fragments are not implemented
  *       at the moment, state->root_skb could be replaced with
  *       a stack for this purpose.
  */
@@ -1860,11 +2200,10 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 /**
  *	skb_pull_rcsum - pull skb and update receive checksum
  *	@skb: buffer to update
- *	@start: start of data before pull
  *	@len: length of data pulled
  *
  *	This function performs an skb_pull on the packet and updates
- *	update the CHECKSUM_COMPLETE checksum. It should be used on
+ *	the CHECKSUM_COMPLETE checksum. It should be used on
  *	receive path processing instead of skb_pull unless you know
  *	that the checksum difference is zero (e.g., a valid IP header)
  *	or you are setting ip_summed to CHECKSUM_NONE.
@@ -2215,6 +2554,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 	return elt;
 }
 
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops. 
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+	if (unlikely(skb->len < 2) ||	/* keeps skb->len - 2 from wrapping */
+	    unlikely((u32)start + off > skb->len - 2)) {
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "bad partial csum: csum=%u/%u len=%u\n",
+			       start, off, skb->len);
+		return false;
+	}
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_headroom(skb) + start;
+	skb->csum_offset = off;
+	return true;
+}
+
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
 EXPORT_SYMBOL(kfree_skb);
@@ -2251,3 +2618,4 @@ EXPORT_SYMBOL(skb_append_datato_frags);
 
 EXPORT_SYMBOL_GPL(skb_to_sgvec);
 EXPORT_SYMBOL_GPL(skb_cow_data);
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);