tcp: Try to restore large SKBs while SACK processing
[pandora-kernel.git] / net / ipv4 / tcp_input.c
index 3c8e297..97d5767 100644 (file)
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged into the shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                                 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
                if (fack_count > tp->fackets_out)
                        tp->fackets_out = fack_count;
-
-               if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-                       tcp_advance_highest_sack(sk, skb);
        }
 
        /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
        return flag;
 }
 
+/* Update TCP bookkeeping after @shifted bytes (@pcount segments) have been
+ * moved from @skb into @prev (see the skb_shift() calls in
+ * tcp_shift_skb_data()).
+ *
+ * @sk:         socket owning the write queue
+ * @prev:       skb that received the data
+ * @skb:        skb that gave up the data
+ * @pcount:     number of segments moved; must be non-zero
+ * @shifted:    number of sequence-space bytes moved
+ * @fack_count: passed through to tcp_sacktag_one()
+ * @reord:      passed through to tcp_sacktag_one()
+ * @flag:       result flags from tcp_sacktag_one() are OR-ed into *flag
+ * @mss:        gso_size to give @prev if it had none
+ *
+ * Returns 1 if @skb ended up empty and was unlinked from the write queue
+ * and freed (the caller must not touch it any more), 0 if @skb still
+ * holds data.
+ */
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                          struct sk_buff *skb, unsigned int pcount,
+                          int shifted, int fack_count, int *reord,
+                          int *flag, int mss)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;      /* We discard results */
+
+       BUG_ON(!pcount);
+
+       /* Move the sequence-space boundary between the two skbs */
+       TCP_SKB_CB(prev)->end_seq += shifted;
+       TCP_SKB_CB(skb)->seq += shifted;
+
+       /* Move the segment accounting along with the data */
+       skb_shinfo(prev)->gso_segs += pcount;
+       BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+       skb_shinfo(skb)->gso_segs -= pcount;
+
+       /* When we're adding to gso_segs == 1, gso_size will be zero,
+        * in theory this shouldn't be necessary but as long as DSACK
+        * code can come after this skb later on it's better to keep
+        * setting gso_size to something.
+        */
+       if (!skb_shinfo(prev)->gso_size) {
+               skb_shinfo(prev)->gso_size = mss;
+               skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+       }
+
+       /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+       if (skb_shinfo(skb)->gso_segs <= 1) {
+               skb_shinfo(skb)->gso_size = 0;
+               skb_shinfo(skb)->gso_type = 0;
+       }
+
+       /* Tag the shifted segments; the updated sacked bits land in
+        * dummy_sacked and are intentionally thrown away.
+        */
+       *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+                                pcount);
+
+       /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+       TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+       tcp_clear_all_retrans_hints(tp);
+
+       if (skb->len > 0) {
+               BUG_ON(!tcp_skb_pcount(skb));
+               return 0;
+       }
+
+       /* Whole SKB was eaten :-) */
+
+       /* Carry the eaten skb's flags over to prev. skb is unlinked and
+        * freed right below, so OR-ing into skb instead would silently
+        * drop them.
+        */
+       TCP_SKB_CB(prev)->flags |= TCP_SKB_CB(skb)->flags;
+       if (skb == tcp_highest_sack(sk))
+               tcp_advance_highest_sack(sk, skb);
+
+       tcp_unlink_write_queue(skb, sk);
+       sk_wmem_free_skb(sk, skb);
+
+       return 1;
+}
+
+/* Segment size to use for shift decisions. tcp_skb_mss() may report zero
+ * (gso_size is "something-or-zero"); in that case the skb's own length
+ * is used instead.
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+       int seg_size = tcp_skb_mss(skb);
+
+       if (seg_size)
+               return seg_size;
+
+       return skb->len;
+}
+
+/* Shifting pages past the head area doesn't work, so an skb qualifies
+ * only when it has no head data and is nonlinear. Returns 1 if so,
+ * 0 otherwise.
+ */
+static int skb_can_shift(struct sk_buff *skb)
+{
+       if (skb_headlen(skb))
+               return 0;
+
+       return skb_is_nonlinear(skb) ? 1 : 0;
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ *
+ * Attempts to shift the part of @skb covered by the SACK range
+ * start_seq..end_seq into the previous skb on the write queue, which must
+ * already carry exactly the TCPCB_SACKED_ACKED tag.
+ *
+ * Returns:
+ *   prev - data was shifted into the previous skb; *fack_count has been
+ *          advanced by the number of shifted segments
+ *   skb  - nothing to shift for this skb (noop); *fack_count is untouched
+ *   NULL - shifting is not possible here; the caller has to fall back to
+ *          its non-shifting handling
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+                                         u32 start_seq, u32 end_seq,
+                                         int dup_sack, int *fack_count,
+                                         int *reord, int *flag)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *prev;
+       int mss;
+       int pcount = 0;
+       int len;
+       int in_sack;
+
+       if (!sk_can_gso(sk))
+               goto fallback;
+
+       /* Normally R but no L won't result in plain S */
+       if (!dup_sack &&
+           (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+               goto fallback;
+       if (!skb_can_shift(skb))
+               goto fallback;
+       /* This frame is about to be dropped (was ACKed). */
+       if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+               goto fallback;
+
+       /* Can only happen with delayed DSACK + discard craziness */
+       if (unlikely(skb == tcp_write_queue_head(sk)))
+               goto fallback;
+       prev = tcp_write_queue_prev(sk, skb);
+
+       /* Only merge into an skb whose tag bits are exactly SACKED_ACKED */
+       if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+               goto fallback;
+
+       /* Is the skb covered by the SACK range from start to end? */
+       in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+                 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+       if (in_sack) {
+               /* Whole skb is covered: shift all of it */
+               len = skb->len;
+               pcount = tcp_skb_pcount(skb);
+               mss = tcp_shift_mss(skb);
+
+               /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                * drop this restriction as unnecessary
+                */
+               if (mss != tcp_shift_mss(prev))
+                       goto fallback;
+       } else {
+               /* skb ends at or before start_seq: nothing to shift */
+               if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+                       goto noop;
+               /* CHECKME: This is non-MSS split case only?, this will
+                * cause skipped skbs due to advancing loop btw, original
+                * has that feature too
+                */
+               if (tcp_skb_pcount(skb) <= 1)
+                       goto noop;
+
+               /* Partial cover: does the SACK range at least start at or
+                * before the beginning of this skb?
+                */
+               in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+               if (!in_sack) {
+                       /* TODO: head merge to next could be attempted here
+                        * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+                        * though it might not be worth the additional hassle
+                        *
+                        * ...we can probably just fall back to what was done
+                        * previously. We could try merging non-SACKed ones
+                        * as well but it probably isn't going to pay off
+                        * because later SACKs might again split them, and
+                        * it would make skb timestamp tracking a considerably
+                        * harder problem.
+                        */
+                       goto fallback;
+               }
+
+               /* Bytes of this skb, counted from its start, that the SACK
+                * range covers
+                */
+               len = end_seq - TCP_SKB_CB(skb)->seq;
+               BUG_ON(len < 0);
+               BUG_ON(len > skb->len);
+
+               /* MSS boundaries should be honoured or else pcount will
+                * severely break even though it makes things a bit trickier.
+                * Optimize common case to avoid most of the divides
+                */
+               mss = tcp_skb_mss(skb);
+
+               /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                * drop this restriction as unnecessary
+                */
+               if (mss != tcp_shift_mss(prev))
+                       goto fallback;
+
+               /* Shift whole segments only: round len down to a multiple
+                * of mss, and do nothing if less than one segment is covered
+                */
+               if (len == mss) {
+                       pcount = 1;
+               } else if (len < mss) {
+                       goto noop;
+               } else {
+                       pcount = len / mss;
+                       len = pcount * mss;
+               }
+       }
+
+       /* A zero return from skb_shift() means nothing was shifted */
+       if (!skb_shift(prev, skb, len))
+               goto fallback;
+       /* tcp_shifted_skb() returns 0 while skb still holds data; only a
+        * fully eaten (and freed) skb lets us try the next one as well
+        */
+       if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+                            flag, mss))
+               goto out;
+
+       /* Hole filled allows collapsing with the next as well, this is very
+        * useful when hole on every nth skb pattern happens
+        */
+       if (prev == tcp_write_queue_tail(sk))
+               goto out;
+       skb = tcp_write_queue_next(sk, prev);
+
+       if (!skb_can_shift(skb))
+               goto out;
+       if (skb == tcp_send_head(sk))
+               goto out;
+       if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+               goto out;
+
+       /* Try to swallow the following skb completely */
+       len = skb->len;
+       if (skb_shift(prev, skb, len)) {
+               pcount += tcp_skb_pcount(skb);
+               tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+                               *fack_count, reord, flag, mss);
+       }
+
+out:
+       /* Account for every segment that was shifted into prev */
+       *fack_count += pcount;
+       return prev;
+
+noop:
+       return skb;
+
+fallback:
+       return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                        struct tcp_sack_block *next_dup,
                                        u32 start_seq, u32 end_seq,
                                        int dup_sack_in, int *fack_count,
                                        int *reord, int *flag)
 {
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *tmp;
+
        tcp_for_write_queue_from(skb, sk) {
                int in_sack = 0;
                int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                dup_sack = 1;
                }
 
-               if (in_sack <= 0)
-                       in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-                                                       end_seq);
+               /* skb reference here is a bit tricky to get right, since
+                * shifting can eat and free both this skb and the next,
+                * so not even _safe variant of the loop is enough.
+                */
+               if (in_sack <= 0) {
+                       tmp = tcp_shift_skb_data(sk, skb, start_seq,
+                                                end_seq, dup_sack,
+                                                fack_count, reord, flag);
+                       if (tmp != NULL) {
+                               if (tmp != skb) {
+                                       skb = tmp;
+                                       continue;
+                               }
+
+                               in_sack = 0;
+                       } else {
+                               in_sack = tcp_match_skb_to_sack(sk, skb,
+                                                               start_seq,
+                                                               end_seq);
+                       }
+               }
+
                if (unlikely(in_sack < 0))
                        break;
 
-               if (in_sack)
+               if (in_sack) {
                        *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
                                                 *fack_count,
                                                 &(TCP_SKB_CB(skb)->sacked),
                                                 tcp_skb_pcount(skb));
 
+                       if (!before(TCP_SKB_CB(skb)->seq,
+                                   tcp_highest_sack_seq(tp)))
+                               tcp_advance_highest_sack(sk, skb);
+               }
+
                *fack_count += tcp_skb_pcount(skb);
        }
        return skb;