net: introduce SO_MAX_PACING_RATE
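
A new SOL_SOCKET option, SO_MAX_PACING_RATE, lets an application cap the pacing rate (in bytes per second) that the stack computes in sk_pacing_rate. The diff below also adds the skb_orphan_partial() helper, teaches sock_alloc_send_pskb() to attempt high-order page allocations through a new max_page_order parameter, and introduces a common sock_recv_errqueue() handler for MSG_ERRQUEUE reads.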
diff --git a/net/core/sock.c b/net/core/sock.c
index 2c097c5..2bd9b3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -93,6 +93,7 @@
 
 #include <linux/capability.h>
 #include <linux/errno.h>
+#include <linux/errqueue.h>
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/in.h>
@@ -913,6 +914,13 @@ set_rcvbuf:
                }
                break;
 #endif
+
+       case SO_MAX_PACING_RATE:
+               sk->sk_max_pacing_rate = val;
+               sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+                                        sk->sk_max_pacing_rate);
+               break;
+
        default:
                ret = -ENOPROTOOPT;
                break;
@@ -1176,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                break;
 #endif
 
+       case SO_MAX_PACING_RATE:
+               v.val = sk->sk_max_pacing_rate;
+               break;
+
        default:
                return -ENOPROTOOPT;
        }
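
From userspace the new option behaves like any other SOL_SOCKET option. A minimal sketch of setting and reading back the cap (the 20 Mbyte/s figure is arbitrary; the fallback define uses the asm-generic value, which arch-specific headers may override):

	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef SO_MAX_PACING_RATE
	#define SO_MAX_PACING_RATE 47	/* asm-generic value; may differ per arch */
	#endif

	static int cap_pacing(int fd, unsigned int bytes_per_sec)
	{
		socklen_t len = sizeof(bytes_per_sec);

		if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			       &bytes_per_sec, sizeof(bytes_per_sec)) < 0) {
			perror("setsockopt(SO_MAX_PACING_RATE)");
			return -1;
		}
		if (getsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			       &bytes_per_sec, &len) < 0) {
			perror("getsockopt(SO_MAX_PACING_RATE)");
			return -1;
		}
		printf("pacing capped at %u bytes/sec\n", bytes_per_sec);
		return 0;
	}

Note that the setsockopt path above also clamps the live sk_pacing_rate with min(), so an already-running flow is slowed immediately rather than on its next rate recomputation.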
@@ -1575,6 +1587,25 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+void skb_orphan_partial(struct sk_buff *skb)
+{
+       /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+        * so we do not completely orphan skb, but transfer all
+        * accounted bytes but one, to avoid unexpected reorders.
+        */
+       if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+           || skb->destructor == tcp_wfree
+#endif
+               ) {
+               atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+               skb->truesize = 1;
+       } else {
+               skb_orphan(skb);
+       }
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
 /*
  * Read buffer destructor automatically called from kfree_skb.
  */
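
The one-byte remainder in skb_orphan_partial() is deliberate: the TCP transmit path derives skb->ooo_okay (permission to move a flow to another TX queue) from sk_wmem_alloc, so a fully orphaned skb would falsely signal an idle socket. Transferring all but one accounted byte releases the send-buffer charge without losing that signal. A hedged sketch of a queueing-discipline enqueue path that could use the helper (example_enqueue and queue_skb are hypothetical):

	/* Shed the sender's wmem charge so a long queue does not stall the
	 * socket, while keeping skb->sk valid and 1 byte accounted.
	 */
	static int example_enqueue(struct sk_buff *skb)
	{
		skb_orphan_partial(skb);
		return queue_skb(skb);		/* hypothetical queueing helper */
	}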
@@ -1721,24 +1752,23 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
-                                    int *errcode)
+                                    int *errcode, int max_page_order)
 {
-       struct sk_buff *skb;
+       struct sk_buff *skb = NULL;
+       unsigned long chunk;
        gfp_t gfp_mask;
        long timeo;
        int err;
        int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+       struct page *page;
+       int i;
 
        err = -EMSGSIZE;
        if (npages > MAX_SKB_FRAGS)
                goto failure;
 
-       gfp_mask = sk->sk_allocation;
-       if (gfp_mask & __GFP_WAIT)
-               gfp_mask |= __GFP_REPEAT;
-
        timeo = sock_sndtimeo(sk, noblock);
-       while (1) {
+       while (!skb) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;
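
The rewrite also restructures the allocation loop: instead of computing gfp_mask once and spinning in while (1), the function now loops while (!skb), hoisting the send-buffer backpressure check to the top of the body (next hunk) and letting every error path fall through to a single failure label, where the kfree_skb(skb) added below is safe for both the NULL and the partially-filled skb.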
@@ -1747,50 +1777,52 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;
 
-               if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
-                       skb = alloc_skb(header_len, gfp_mask);
-                       if (skb) {
-                               int i;
-
-                               /* No pages, we're done... */
-                               if (!data_len)
-                                       break;
-
-                               skb->truesize += data_len;
-                               skb_shinfo(skb)->nr_frags = npages;
-                               for (i = 0; i < npages; i++) {
-                                       struct page *page;
-
-                                       page = alloc_pages(sk->sk_allocation, 0);
-                                       if (!page) {
-                                               err = -ENOBUFS;
-                                               skb_shinfo(skb)->nr_frags = i;
-                                               kfree_skb(skb);
-                                               goto failure;
-                                       }
-
-                                       __skb_fill_page_desc(skb, i,
-                                                       page, 0,
-                                                       (data_len >= PAGE_SIZE ?
-                                                        PAGE_SIZE :
-                                                        data_len));
-                                       data_len -= PAGE_SIZE;
-                               }
+               if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
+                       set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+                       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+                       err = -EAGAIN;
+                       if (!timeo)
+                               goto failure;
+                       if (signal_pending(current))
+                               goto interrupted;
+                       timeo = sock_wait_for_wmem(sk, timeo);
+                       continue;
+               }
 
-                               /* Full success... */
-                               break;
-                       }
-                       err = -ENOBUFS;
+               err = -ENOBUFS;
+               gfp_mask = sk->sk_allocation;
+               if (gfp_mask & __GFP_WAIT)
+                       gfp_mask |= __GFP_REPEAT;
+
+               skb = alloc_skb(header_len, gfp_mask);
+               if (!skb)
                        goto failure;
+
+               skb->truesize += data_len;
+
+               for (i = 0; npages > 0; i++) {
+                       int order = max_page_order;
+
+                       while (order) {
+                               if (npages >= 1 << order) {
+                                       page = alloc_pages(sk->sk_allocation |
+                                                          __GFP_COMP | __GFP_NOWARN,
+                                                          order);
+                                       if (page)
+                                               goto fill_page;
+                               }
+                               order--;
+                       }
+                       page = alloc_page(sk->sk_allocation);
+                       if (!page)
+                               goto failure;
+fill_page:
+                       chunk = min_t(unsigned long, data_len,
+                                     PAGE_SIZE << order);
+                       skb_fill_page_desc(skb, i, page, 0, chunk);
+                       data_len -= chunk;
+                       npages -= 1 << order;
                }
-               set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
-               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-               err = -EAGAIN;
-               if (!timeo)
-                       goto failure;
-               if (signal_pending(current))
-                       goto interrupted;
-               timeo = sock_wait_for_wmem(sk, timeo);
        }
 
        skb_set_owner_w(skb, sk);
@@ -1799,6 +1831,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 interrupted:
        err = sock_intr_errno(timeo);
 failure:
+       kfree_skb(skb);
        *errcode = err;
        return NULL;
 }
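
The new inner loop always tries the largest compound page that still fits the remaining payload, stepping the order down when an allocation fails (__GFP_NOWARN keeps those attempts quiet) and falling back to a plain order-0 page as a last resort. A standalone userspace sketch of just the order-selection arithmetic, assuming every allocation succeeds (illustration only):

	#include <stdio.h>

	int main(void)
	{
		int npages = 5;			/* payload spanning 5 pages */
		int max_page_order = 3;		/* caller's cap, as in the new parameter */

		while (npages > 0) {
			int order = max_page_order;

			/* shrink the order until the chunk fits what is left */
			while (order && npages < (1 << order))
				order--;
			printf("allocate order-%d chunk (%d page(s))\n",
			       order, 1 << order);
			npages -= 1 << order;
		}
		return 0;
	}

With npages = 5 and max_page_order = 3 this selects an order-2 chunk (4 pages) followed by an order-0 page; the kernel loop additionally steps the order down when alloc_pages() itself fails before reaching the alloc_page() fallback.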
@@ -1807,7 +1840,7 @@ EXPORT_SYMBOL(sock_alloc_send_pskb);
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
 {
-       return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+       return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
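Passing max_page_order = 0 keeps sock_alloc_send_skb() on single, order-0 pages, so existing callers of the wrapper see no behavioral change.
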
@@ -2297,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        sk->sk_ll_usec          =       sysctl_net_busy_read;
 #endif
 
+       sk->sk_max_pacing_rate = ~0U;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
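
The ~0U default means "no cap": min(sk_pacing_rate, sk_max_pacing_rate) then always yields sk_pacing_rate until an application lowers the limit via the new option.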
@@ -2425,6 +2459,52 @@ void sock_enable_timestamp(struct sock *sk, int flag)
        }
 }
 
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+                      int level, int type)
+{
+       struct sock_exterr_skb *serr;
+       struct sk_buff *skb, *skb2;
+       int copied, err;
+
+       err = -EAGAIN;
+       skb = skb_dequeue(&sk->sk_error_queue);
+       if (skb == NULL)
+               goto out;
+
+       copied = skb->len;
+       if (copied > len) {
+               msg->msg_flags |= MSG_TRUNC;
+               copied = len;
+       }
+       err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+       if (err)
+               goto out_free_skb;
+
+       sock_recv_timestamp(msg, sk, skb);
+
+       serr = SKB_EXT_ERR(skb);
+       put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+       msg->msg_flags |= MSG_ERRQUEUE;
+       err = copied;
+
+       /* Reset and regenerate socket error */
+       spin_lock_bh(&sk->sk_error_queue.lock);
+       sk->sk_err = 0;
+       if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+               sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+               spin_unlock_bh(&sk->sk_error_queue.lock);
+               sk->sk_error_report(sk);
+       } else
+               spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+       kfree_skb(skb);
+out:
+       return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
+
 /*
 *	Get a socket option on a socket.
  *