tcp: fix cwnd limited checking to improve congestion control

author Eric Dumazet <edumazet@google.com>

Wed, 30 Apr 2014 18:58:13 +0000 (11:58 -0700)

committer David S. Miller <davem@davemloft.net>

Fri, 2 May 2014 21:54:35 +0000 (17:54 -0400)
author Eric Dumazet <edumazet@google.com>
Wed, 30 Apr 2014 18:58:13 +0000 (11:58 -0700)
committer David S. Miller <davem@davemloft.net>
Fri, 2 May 2014 21:54:35 +0000 (17:54 -0400)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h

index 2399468..4e37c71 100644 (file)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -230,6 +230,7 @@ struct tcp_sock {
         u32     snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
         u32     snd_cwnd_used;
         u32     snd_cwnd_stamp;
+       u32     lsnd_pending;   /* packets inflight or unsent since last xmit */
         u32     prior_cwnd;     /* Congestion window at start of Recovery. */
         u32     prr_delivered;  /* Number of newly delivered packets to
                                  * receiver in Recovery. */
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 163d2b4..a9fe7bc 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -974,7 +974,27 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
  {
         return tp->snd_una + tp->snd_wnd;
  }
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight);
+
+/* We follow the spirit of RFC2861 to validate cwnd but implement a more
+ * flexible approach. The RFC suggests cwnd should not be raised unless
+ * it was fully used previously. But we allow cwnd to grow as long as the
+ * application has used half the cwnd.
+ * Example :
+ *    cwnd is 10 (IW10), but application sends 9 frames.
+ *    We allow cwnd to reach 18 when all frames are ACKed.
+ * This check is safe because it's as aggressive as slow start which already
+ * risks 100% overshoot. The advantage is that we discourage application to
+ * either send more filler packets or data to artificially blow up the cwnd
+ * usage, and allow application-limited process to probe bw more aggressively.
+ *
+ * TODO: remove in_flight once we can fix all callers, and their callers...
+ */
+static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       return tp->snd_cwnd < 2 * tp->lsnd_pending;
+}
  
  static inline void tcp_check_probe_timer(struct sock *sk)
  {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c

index 2b9464c..a93b41b 100644 (file)
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
         return err;
  }
  
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
-       const struct tcp_sock *tp = tcp_sk(sk);
-       u32 left;
-
-       if (in_flight >= tp->snd_cwnd)
-               return true;
-
-       left = tp->snd_cwnd - in_flight;
-       if (sk_can_gso(sk) &&
-           left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
-           left < tp->xmit_size_goal_segs)
-               return true;
-       return left <= tcp_max_tso_deferred_mss(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
  /* Slow start is used when congestion window is no greater than the slow start
   * threshold. We base on RFC2581 and also handle stretch ACKs properly.
   * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 20847de..f9181a1 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk)
         tp->snd_cwnd_stamp = tcp_time_stamp;
  }
  
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
  {
         struct tcp_sock *tp = tcp_sk(sk);
  
-       if (tp->packets_out >= tp->snd_cwnd) {
+       tp->lsnd_pending = tp->packets_out + unsent_segs;
+
+       if (tcp_is_cwnd_limited(sk, 0)) {
                 /* Network is feed fully. */
                 tp->snd_cwnd_used = 0;
                 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb;
-       unsigned int tso_segs, sent_pkts;
+       unsigned int tso_segs, sent_pkts, unsent_segs = 0;
         int cwnd_quota;
         int result;
  
@@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                 break;
                 } else {
                         if (!push_one && tcp_tso_should_defer(sk, skb))
-                               break;
+                               goto compute_unsent_segs;
                 }
  
                 /* TCP Small Queues :
@@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                          * there is no smp_mb__after_set_bit() yet
                          */
                         smp_mb__after_clear_bit();
-                       if (atomic_read(&sk->sk_wmem_alloc) > limit)
+                       if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+                               u32 unsent_bytes;
+
+compute_unsent_segs:
+                               unsent_bytes = tp->write_seq - tp->snd_nxt;
+                               unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
                                 break;
+                       }
                 }
  
                 limit = mss_now;
@@ -1990,7 +1997,7 @@ repair:
                 /* Send one loss probe per tail loss episode. */
                 if (push_one != 2)
                         tcp_schedule_loss_probe(sk);
-               tcp_cwnd_validate(sk);
+               tcp_cwnd_validate(sk, unsent_segs);
                 return false;
         }
         return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
author	Eric Dumazet <edumazet@google.com>
	Wed, 30 Apr 2014 18:58:13 +0000 (11:58 -0700)
committer	David S. Miller <davem@davemloft.net>
	Fri, 2 May 2014 21:54:35 +0000 (17:54 -0400)
include/linux/tcp.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/tcp_cong.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history