[TCP]: Fix double adjustment of tp->{lost,left}_out in tcp_fragment().
[pandora-kernel.git] / net / ipv4 / tcp_output.c
index dd30dd1..c10e443 100644 (file)
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 
 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
  * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 {
+       struct tcp_sock *tp = tcp_sk(sk);
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
        u32 cwnd = tp->snd_cwnd;
 
-       tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+       tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 
-       tp->snd_ssthresh = tcp_current_ssthresh(tp);
+       tp->snd_ssthresh = tcp_current_ssthresh(sk);
        restart_cwnd = min(restart_cwnd, cwnd);
 
-       while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+       while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 static inline void tcp_event_data_sent(struct tcp_sock *tp,
                                       struct sk_buff *skb, struct sock *sk)
 {
-       u32 now = tcp_time_stamp;
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       const u32 now = tcp_time_stamp;
 
-       if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
-               tcp_cwnd_restart(tp, __sk_dst_get(sk));
+       if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+               tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
        tp->lsndtime = now;
 
        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
-       if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
-               tp->ack.pingpong = 1;
+       if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+               icsk->icsk_ack.pingpong = 1;
 }
 
 static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-
-       tcp_dec_quickack_mode(tp, pkts);
-       tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+       tcp_dec_quickack_mode(sk, pkts);
+       inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
        if (skb != NULL) {
+               const struct inet_connection_sock *icsk = inet_csk(sk);
                struct inet_sock *inet = inet_sk(sk);
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_SACK       0x4
 
                /* If congestion control is doing timestamping */
-               if (tp->ca_ops->rtt_sample)
-                       do_gettimeofday(&skb->stamp);
+               if (icsk->icsk_ca_ops->rtt_sample)
+                       __net_timestamp(skb);
 
                sysctl_flags = 0;
                if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                }
                
                if (tcp_packets_in_flight(tp) == 0)
-                       tcp_ca_event(tp, CA_EVENT_TX_START);
+                       tcp_ca_event(sk, CA_EVENT_TX_START);
 
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
                if (err <= 0)
                        return err;
 
-               tcp_enter_cwr(tp);
+               tcp_enter_cwr(sk);
 
                /* NET_XMIT_CN is special. It does not guarantee,
                 * that this packet is lost. It tells that device
@@ -427,11 +428,11 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned
  * packet to the list.  This won't be called frequently, I hope. 
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
-       int nsize;
+       int nsize, old_factor;
        u16 flags;
 
        nsize = skb_headlen(skb) - len;
@@ -482,30 +483,36 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
-       buff->stamp = skb->stamp;
+       buff->tstamp = skb->tstamp;
 
-       if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
-               tp->lost_out -= tcp_skb_pcount(skb);
-               tp->left_out -= tcp_skb_pcount(skb);
-       }
+       old_factor = tcp_skb_pcount(skb);
 
        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
        tcp_set_skb_tso_segs(sk, buff, mss_now);
 
-       if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
-               tp->lost_out += tcp_skb_pcount(skb);
-               tp->left_out += tcp_skb_pcount(skb);
-       }
+       /* If this packet has been sent out already, we must
+        * adjust the various packet counters.
+        */
+       if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+               int diff = old_factor - tcp_skb_pcount(skb) -
+                       tcp_skb_pcount(buff);
 
-       if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
-               tp->lost_out += tcp_skb_pcount(buff);
-               tp->left_out += tcp_skb_pcount(buff);
+               tp->packets_out -= diff;
+               if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+                       tp->lost_out -= diff;
+                       tp->left_out -= diff;
+               }
+               if (diff > 0) {
+                       tp->fackets_out -= diff;
+                       if ((int)tp->fackets_out < 0)
+                               tp->fackets_out = 0;
+               }
        }
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
-       __skb_append(skb, buff);
+       __skb_append(skb, buff, &sk->sk_write_queue);
 
        return 0;
 }
@@ -696,7 +703,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
                if (tp->packets_out > tp->snd_cwnd_used)
                        tp->snd_cwnd_used = tp->packets_out;
 
-               if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+               if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
                        tcp_cwnd_application_limited(sk);
        }
 }
@@ -893,7 +900,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
-       __skb_append(skb, buff);
+       __skb_append(skb, buff, &sk->sk_write_queue);
 
        return 0;
 }
@@ -905,12 +912,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 send_win, cong_win, limit, in_flight;
 
        if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
                return 0;
 
-       if (tp->ca_state != TCP_CA_Open)
+       if (icsk->icsk_ca_state != TCP_CA_Open)
                return 0;
 
        in_flight = tcp_packets_in_flight(tp);
@@ -1147,6 +1155,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        /* MSS for the peer's data.  Previous verions used mss_clamp
         * here.  I don't know if the value based on our guesses
@@ -1154,7 +1163,7 @@ u32 __tcp_select_window(struct sock *sk)
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
-       int mss = tp->ack.rcv_mss;
+       int mss = icsk->icsk_ack.rcv_mss;
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;
@@ -1163,7 +1172,7 @@ u32 __tcp_select_window(struct sock *sk)
                mss = full_space; 
 
        if (free_space < full_space/2) {
-               tp->ack.quick = 0;
+               icsk->icsk_ack.quick = 0;
 
                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1238,7 +1247,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
                       tcp_skb_pcount(next_skb) != 1);
 
                /* Ok.  We will be able to collapse the packet. */
-               __skb_unlink(next_skb, next_skb->list);
+               __skb_unlink(next_skb, &sk->sk_write_queue);
 
                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
@@ -1286,6 +1295,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
  */ 
 void tcp_simple_retransmit(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
@@ -1316,12 +1326,12 @@ void tcp_simple_retransmit(struct sock *sk)
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
-       if (tp->ca_state != TCP_CA_Loss) {
+       if (icsk->icsk_ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
-               tp->snd_ssthresh = tcp_current_ssthresh(tp);
+               tp->snd_ssthresh = tcp_current_ssthresh(sk);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
-               tcp_set_ca_state(tp, TCP_CA_Loss);
+               tcp_set_ca_state(sk, TCP_CA_Loss);
        }
        tcp_xmit_retransmit_queue(sk);
 }
@@ -1346,12 +1356,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        BUG();
-
-               if (sk->sk_route_caps & NETIF_F_TSO) {
-                       sk->sk_route_caps &= ~NETIF_F_TSO;
-                       sock_set_flag(sk, SOCK_NO_LARGESEND);
-               }
-
                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                        return -ENOMEM;
        }
@@ -1366,22 +1370,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                return -EAGAIN;
 
        if (skb->len > cur_mss) {
-               int old_factor = tcp_skb_pcount(skb);
-               int diff;
-
                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
                        return -ENOMEM; /* We'll try again later. */
-
-               /* New SKB created, account for it. */
-               diff = old_factor - tcp_skb_pcount(skb) -
-                      tcp_skb_pcount(skb->next);
-               tp->packets_out -= diff;
-
-               if (diff > 0) {
-                       tp->fackets_out -= diff;
-                       if ((int)tp->fackets_out < 0)
-                               tp->fackets_out = 0;
-               }
        }
 
        /* Collapse two adjacent packets if worthwhile and we can. */
@@ -1461,6 +1451,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int packet_cnt = tp->lost_out;
@@ -1484,14 +1475,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
                                        if (tcp_retransmit_skb(sk, skb))
                                                return;
-                                       if (tp->ca_state != TCP_CA_Loss)
+                                       if (icsk->icsk_ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                                        else
                                                NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
                                        if (skb ==
                                            skb_peek(&sk->sk_write_queue))
-                                               tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+                                               inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                                                         inet_csk(sk)->icsk_rto,
+                                                                         TCP_RTO_MAX);
                                }
 
                                packet_cnt -= tcp_skb_pcount(skb);
@@ -1504,7 +1497,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        /* OK, demanded retransmission is finished. */
 
        /* Forward retransmissions are possible only during Recovery. */
-       if (tp->ca_state != TCP_CA_Recovery)
+       if (icsk->icsk_ca_state != TCP_CA_Recovery)
                return;
 
        /* No forward retransmissions in Reno are possible. */
@@ -1544,7 +1537,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        break;
 
                if (skb == skb_peek(&sk->sk_write_queue))
-                       tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+                       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                                 inet_csk(sk)->icsk_rto,
+                                                 TCP_RTO_MAX);
 
                NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
        }
@@ -1573,7 +1568,7 @@ void tcp_send_fin(struct sock *sk)
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
-                       skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+                       skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        yield();
@@ -1780,8 +1775,8 @@ static inline void tcp_connect_init(struct sock *sk)
        tp->rcv_wup = 0;
        tp->copied_seq = 0;
 
-       tp->rto = TCP_TIMEOUT_INIT;
-       tp->retransmits = 0;
+       inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+       inet_csk(sk)->icsk_retransmits = 0;
        tcp_clear_retrans(tp);
 }
 
@@ -1795,7 +1790,7 @@ int tcp_connect(struct sock *sk)
 
        tcp_connect_init(sk);
 
-       buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+       buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;
 
@@ -1824,7 +1819,8 @@ int tcp_connect(struct sock *sk)
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
        /* Timer for repeating the SYN until an answer. */
-       tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
        return 0;
 }
 
@@ -1834,20 +1830,21 @@ int tcp_connect(struct sock *sk)
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-       int ato = tp->ack.ato;
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int ato = icsk->icsk_ack.ato;
        unsigned long timeout;
 
        if (ato > TCP_DELACK_MIN) {
+               const struct tcp_sock *tp = tcp_sk(sk);
                int max_ato = HZ/2;
 
-               if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+               if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;
 
                /* Slow path, intersegment interval is "high". */
 
                /* If some rtt estimate is known, use it to bound delayed ack.
-                * Do not use tp->rto here, use results of rtt measurements
+                * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt) {
@@ -1864,21 +1861,22 @@ void tcp_send_delayed_ack(struct sock *sk)
        timeout = jiffies + ato;
 
        /* Use new timeout only if there wasn't a older one earlier. */
-       if (tp->ack.pending&TCP_ACK_TIMER) {
+       if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
                /* If delack timer was blocked or is about to expire,
                 * send ACK now.
                 */
-               if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+               if (icsk->icsk_ack.blocked ||
+                   time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
                        tcp_send_ack(sk);
                        return;
                }
 
-               if (!time_before(timeout, tp->ack.timeout))
-                       timeout = tp->ack.timeout;
+               if (!time_before(timeout, icsk->icsk_ack.timeout))
+                       timeout = icsk->icsk_ack.timeout;
        }
-       tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
-       tp->ack.timeout = timeout;
-       sk_reset_timer(sk, &tp->delack_timer, timeout);
+       icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+       icsk->icsk_ack.timeout = timeout;
+       sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -1895,9 +1893,10 @@ void tcp_send_ack(struct sock *sk)
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
-                       tcp_schedule_ack(tp);
-                       tp->ack.ato = TCP_ATO_MIN;
-                       tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+                       inet_csk_schedule_ack(sk);
+                       inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+                       inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+                                                 TCP_DELACK_MAX, TCP_RTO_MAX);
                        return;
                }
 
@@ -1980,12 +1979,6 @@ int tcp_write_wakeup(struct sock *sk)
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size, mss))
                                        return -1;
-                               /* SWS override triggered forced fragmentation.
-                                * Disable TSO, the connection is too sick. */
-                               if (sk->sk_route_caps & NETIF_F_TSO) {
-                                       sock_set_flag(sk, SOCK_NO_LARGESEND);
-                                       sk->sk_route_caps &= ~NETIF_F_TSO;
-                               }
                        } else if (!tcp_skb_pcount(skb))
                                tcp_set_skb_tso_segs(sk, skb, mss);
 
@@ -2011,6 +2004,7 @@ int tcp_write_wakeup(struct sock *sk)
  */
 void tcp_send_probe0(struct sock *sk)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int err;
 
@@ -2018,28 +2012,31 @@ void tcp_send_probe0(struct sock *sk)
 
        if (tp->packets_out || !sk->sk_send_head) {
                /* Cancel probe timer, if it is not required. */
-               tp->probes_out = 0;
-               tp->backoff = 0;
+               icsk->icsk_probes_out = 0;
+               icsk->icsk_backoff = 0;
                return;
        }
 
        if (err <= 0) {
-               if (tp->backoff < sysctl_tcp_retries2)
-                       tp->backoff++;
-               tp->probes_out++;
-               tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-                                     min(tp->rto << tp->backoff, TCP_RTO_MAX));
+               if (icsk->icsk_backoff < sysctl_tcp_retries2)
+                       icsk->icsk_backoff++;
+               icsk->icsk_probes_out++;
+               inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+                                         min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+                                         TCP_RTO_MAX);
        } else {
                /* If packet was not sent due to local congestion,
-                * do not backoff and do not remember probes_out.
+                * do not backoff and do not remember icsk_probes_out.
                 * Let local senders to fight for local resources.
                 *
                 * Use accumulated backoff yet.
                 */
-               if (!tp->probes_out)
-                       tp->probes_out=1;
-               tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-                                     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+               if (!icsk->icsk_probes_out)
+                       icsk->icsk_probes_out = 1;
+               inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+                                         min(icsk->icsk_rto << icsk->icsk_backoff,
+                                             TCP_RESOURCE_PROBE_INTERVAL),
+                                         TCP_RTO_MAX);
        }
 }