net: introduce SO_MAX_PACING_RATE
[pandora-kernel.git] / net / ipv4 / tcp_input.c
index 25a89ea..66aa816 100644 (file)
@@ -355,6 +355,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
        rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
                 tcp_default_init_rwnd(mss);
 
+       /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+        * Allow enough cushion so that sender is not limited by our window
+        */
+       if (sysctl_tcp_moderate_rcvbuf)
+               rcvmem <<= 2;
+
        if (sk->sk_rcvbuf < rcvmem)
                sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
@@ -373,6 +379,8 @@ void tcp_init_buffer_space(struct sock *sk)
                tcp_fixup_sndbuf(sk);
 
        tp->rcvq_space.space = tp->rcv_wnd;
+       tp->rcvq_space.time = tcp_time_stamp;
+       tp->rcvq_space.seq = tp->copied_seq;
 
        maxwin = tcp_full_space(sk);
 
@@ -512,48 +520,62 @@ void tcp_rcv_space_adjust(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int time;
-       int space;
-
-       if (tp->rcvq_space.time == 0)
-               goto new_measure;
+       int copied;
 
        time = tcp_time_stamp - tp->rcvq_space.time;
        if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
                return;
 
-       space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+       /* Number of bytes copied to user in last RTT */
+       copied = tp->copied_seq - tp->rcvq_space.seq;
+       if (copied <= tp->rcvq_space.space)
+               goto new_measure;
+
+       /* A bit of theory :
+        * copied = bytes received in previous RTT, our base window
+        * To cope with packet losses, we need a 2x factor
+        * To cope with slow start, and sender growing its cwin by 100 %
+        * every RTT, we need a 4x factor, because the ACK we are sending
+        * now is for the next RTT, not the current one :
+        * <prev RTT . ><current RTT .. ><next RTT .... >
+        */
+
+       if (sysctl_tcp_moderate_rcvbuf &&
+           !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+               int rcvwin, rcvmem, rcvbuf;
 
-       space = max(tp->rcvq_space.space, space);
+               /* minimal window to cope with packet losses, assuming
+                * steady state. Add some cushion because of small variations.
+                */
+               rcvwin = (copied << 1) + 16 * tp->advmss;
 
-       if (tp->rcvq_space.space != space) {
-               int rcvmem;
+               /* If rate increased by 25%,
+                *      assume slow start, rcvwin = 3 * copied
+                * If rate increased by 50%,
+                *      assume sender can use 2x growth, rcvwin = 4 * copied
+                */
+               if (copied >=
+                   tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+                       if (copied >=
+                           tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+                               rcvwin <<= 1;
+                       else
+                               rcvwin += (rcvwin >> 1);
+               }
 
-               tp->rcvq_space.space = space;
+               rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+               while (tcp_win_from_space(rcvmem) < tp->advmss)
+                       rcvmem += 128;
 
-               if (sysctl_tcp_moderate_rcvbuf &&
-                   !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-                       int new_clamp = space;
+               rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+               if (rcvbuf > sk->sk_rcvbuf) {
+                       sk->sk_rcvbuf = rcvbuf;
 
-                       /* Receive space grows, normalize in order to
-                        * take into account packet headers and sk_buff
-                        * structure overhead.
-                        */
-                       space /= tp->advmss;
-                       if (!space)
-                               space = 1;
-                       rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-                       while (tcp_win_from_space(rcvmem) < tp->advmss)
-                               rcvmem += 128;
-                       space *= rcvmem;
-                       space = min(space, sysctl_tcp_rmem[2]);
-                       if (space > sk->sk_rcvbuf) {
-                               sk->sk_rcvbuf = space;
-
-                               /* Make the window clamp follow along.  */
-                               tp->window_clamp = new_clamp;
-                       }
+                       /* Make the window clamp follow along.  */
+                       tp->window_clamp = rcvwin;
                }
        }
+       tp->rcvq_space.space = copied;
 
 new_measure:
        tp->rcvq_space.seq = tp->copied_seq;
@@ -713,7 +735,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
        if (tp->srtt > 8 + 2)
                do_div(rate, tp->srtt);
 
-       sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+       sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
@@ -5674,8 +5696,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                        tcp_init_congestion_control(sk);
 
                        tcp_mtup_init(sk);
-                       tcp_init_buffer_space(sk);
                        tp->copied_seq = tp->rcv_nxt;
+                       tcp_init_buffer_space(sk);
                }
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);