EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
-int sysctl_tcp_adv_win_scale __read_mostly = 2;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 1000;
+
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (tp->ecn_flags & TCP_ECN_OK) {
- if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ if (!(tp->ecn_flags & TCP_ECN_OK))
+ return;
+
+ switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
- * it is surely retransmit. It is not in ECN RFC,
- * but Linux follows this rule. */
- else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+ * and we already seen ECT on a previous segment,
+ * it is probably a retransmit.
+ */
+ if (tp->ecn_flags & TCP_ECN_SEEN)
tcp_enter_quickack_mode((struct sock *)tp);
+ break;
+ case INET_ECN_CE:
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ /* fallinto */
+ default:
+ tp->ecn_flags |= TCP_ECN_SEEN;
}
}
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
+static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
+static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
tp->ecn_flags &= ~TCP_ECN_OK;
}
-static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
+static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return 1;
static void tcp_fixup_sndbuf(struct sock *sk)
{
- int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
- sizeof(struct sk_buff);
+ int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
- if (sk->sk_sndbuf < 3 * sndmem) {
- sk->sk_sndbuf = 3 * sndmem;
- if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
- sk->sk_sndbuf = sysctl_tcp_wmem[2];
- }
+ sndmem *= TCP_INIT_CWND;
+ if (sk->sk_sndbuf < sndmem)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
return 0;
}
-static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
incr = __tcp_grow_window(sk, skb);
if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
tp->window_clamp);
inet_csk(sk)->icsk_ack.quick |= 1;
static void tcp_fixup_rcvbuf(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+ u32 mss = tcp_sk(sk)->advmss;
+ u32 icwnd = TCP_DEFAULT_INIT_RCVWND;
+ int rcvmem;
- /* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and corresponding skbs will fit to our rcvbuf.
- * (was 3; 4 is minimum to allow fast retransmit to work.)
+ /* Limit to 10 segments if mss <= 1460,
+ * or 14600/mss segments, with a minimum of two segments.
*/
- while (tcp_win_from_space(rcvmem) < tp->advmss)
+ if (mss > 1460)
+ icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
+
+ rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < mss)
rcvmem += 128;
- if (sk->sk_rcvbuf < 4 * rcvmem)
- sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+
+ rcvmem *= icwnd;
+
+ if (sk->sk_rcvbuf < rcvmem)
+ sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}
/* 4. Try to fixup all. It is made immediately after connection enters
*/
void tcp_initialize_rcv_mss(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
- } else if (m < new_sample)
- new_sample = m << 3;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
} else {
/* No previous measure. */
new_sample = m << 3;
space /= tp->advmss;
if (!space)
space = 1;
- rcvmem = (tp->advmss + MAX_TCP_HEADER +
- 16 + sizeof(struct sk_buff));
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(rcvmem) < tp->advmss)
rcvmem += 128;
space *= rcvmem;
}
}
-__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
tp->lost_retrans_low = new_low_seq;
}
-static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
+static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
{
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
- if (!in_sack && new_len < pkt_len) {
+ if (!in_sack && new_len < pkt_len)
new_len += mss;
- if (new_len > skb->len)
- return 0;
- }
pkt_len = new_len;
}
+
+ if (pkt_len >= skb->len && !in_sack)
+ return 0;
+
err = tcp_fragment(sk, skb, pkt_len, mss);
if (err < 0)
return err;
return in_sack;
}
-static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
- struct tcp_sacktag_state *state,
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
int dup_sack, int pcount)
{
struct tcp_sock *tp = tcp_sk(sk);
- u8 sacked = TCP_SKB_CB(skb)->sacked;
int fack_count = state->fack_count;
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans &&
- after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+ after(end_seq, tp->undo_marker))
tp->undo_retrans--;
if (sacked & TCPCB_SACKED_ACKED)
state->reord = min(fack_count, state->reord);
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ if (!after(end_seq, tp->snd_una))
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
- if (before(TCP_SKB_CB(skb)->seq,
+ if (before(start_seq,
tcp_highest_sack_seq(tp)))
state->reord = min(fack_count,
state->reord);
/* SACK enhanced F-RTO (RFC4138; Appendix B) */
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
+ if (!after(end_seq, tp->frto_highmark))
state->flag |= FLAG_ONLY_ORIG_SACKED;
}
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
if (fack_count > tp->fackets_out)
return sacked;
}
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
BUG_ON(!pcount);
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount);
+
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
skb_shinfo(skb)->gso_type = 0;
}
- /* We discard results */
- tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
-
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
- TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
/* I wish gso_size would have a bit more sane initialization than
* something-or-zero which complicates things
*/
-static int tcp_skb_seglen(struct sk_buff *skb)
+static int tcp_skb_seglen(const struct sk_buff *skb)
{
return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
}
/* Shifting pages past head area doesn't work */
-static int skb_can_shift(struct sk_buff *skb)
+static int skb_can_shift(const struct sk_buff *skb)
{
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
}
}
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
if (!skb_shift(prev, skb, len))
goto fallback;
if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
break;
if (in_sack) {
- TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
- state,
- dup_sack,
- tcp_skb_pcount(skb));
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb));
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
return skb;
}
-static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}
static int
-tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- unsigned char *ptr = (skb_transport_header(ack_skb) +
- TCP_SKB_CB(ack_skb)->sacked);
+ const unsigned char *ptr = (skb_transport_header(ack_skb) +
+ TCP_SKB_CB(ack_skb)->sacked);
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
- if (!how) {
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- tp->undo_marker = tp->snd_una;
- } else {
+ tp->undo_marker = tp->snd_una;
+ if (how) {
tp->sacked_out = 0;
tp->fackets_out = 0;
}
return 0;
}
-static inline int tcp_fackets_out(struct tcp_sock *tp)
+static inline int tcp_fackets_out(const struct tcp_sock *tp)
{
return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
}
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
-static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
+static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
-static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_skb_timedout(const struct sock *sk,
+ const struct sk_buff *skb)
{
return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}
-static inline int tcp_head_timedout(struct sock *sk)
+static inline int tcp_head_timedout(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
return tp->packets_out &&
tcp_skb_timedout(sk, tcp_write_queue_head(sk));
if (cnt > packets) {
if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
(oldcnt >= packets))
break;
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
-static inline int tcp_packet_delayed(struct tcp_sock *tp)
+static inline int tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static inline int tcp_may_undo(struct tcp_sock *tp)
+static inline int tcp_may_undo(const struct tcp_sock *tp)
{
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
-static int tcp_any_retrans_done(struct sock *sk)
+static int tcp_any_retrans_done(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (tp->retrans_out)
static inline void tcp_complete_cwr(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- /* Do not moderate cwnd if it's already undone in cwr or recovery */
- if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ /* Do not moderate cwnd if it's already undone in cwr or recovery. */
+ if (tp->undo_marker) {
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+ /* PRR algorithm. */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
}
EXPORT_SYMBOL(tcp_simple_retransmit);
+/* This function implements the PRR algorithm, specifcally the PRR-SSRB
+ * (proportional rate reduction with slow start reduction bound) as described in
+ * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ * 1) If the packets in flight is larger than ssthresh, PRR spreads the
+ * cwnd reductions across a full RTT.
+ * 2) If packets in flight is lower than ssthresh (such as due to excess
+ * losses and/or application stalls), do not perform any further cwnd
+ * reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
+ int fast_rexmit, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int sndcnt = 0;
+ int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+ if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+ tp->prior_cwnd - 1;
+ sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+ } else {
+ sndcnt = min_t(int, delta,
+ max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked) + 1);
+ }
+
+ sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
-static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
+ int newly_acked_sacked, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
fast_rexmit = 1;
}
if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_down(sk, flag);
+ tp->prr_delivered += newly_acked_sacked;
+ tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
tcp_xmit_retransmit_queue(sk);
}
*/
static void tcp_rearm_rto(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPHDR_SYN)) {
+ if (!(scb->tcp_flags & TCPHDR_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets)
+ if (reord < prior_fackets && reord <= tp->fackets_out)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
-static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
}
} else {
if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+ if (!tcp_packets_in_flight(tp)) {
+ tcp_enter_frto_loss(sk, 2, flag);
+ return true;
+ }
+
/* Prevent sending of new data. */
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp));
return 0;
}
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+ /* unprotected vars, we dont care of overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ u32 count, now = jiffies / HZ;
+
+ if (now != challenge_timestamp) {
+ u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
+
+ challenge_timestamp = now;
+ ACCESS_ONCE(challenge_count) =
+ half + (u32)(
+ ((u64) random32() * sysctl_tcp_challenge_ack_limit)
+ >> 32);
+ }
+ count = ACCESS_ONCE(challenge_count);
+ if (count > 0) {
+ ACCESS_ONCE(challenge_count) = count - 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
+}
+
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
+}
+
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
+ */
+
+ if (tcp_paws_check(&tp->rx_opt, 0))
+ tcp_store_ts_recent(tp);
+ }
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
-static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_in_flight;
u32 prior_fackets;
int prior_packets;
+ int prior_sacked = tp->sacked_out;
+ int newly_acked_sacked = 0;
int frto_cwnd = 0;
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (before(ack, prior_snd_una))
+ if (before(ack, prior_snd_una)) {
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - tp->max_window)) {
+ tcp_send_challenge_ack(sk);
+ return -1;
+ }
goto old_ack;
+ }
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
prior_fackets = tp->fackets_out;
prior_in_flight = tcp_packets_in_flight(tp);
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+ newly_acked_sacked = (prior_packets - prior_sacked) -
+ (tp->packets_out - tp->sacked_out);
+
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, flag);
/* Guarantee sacktag reordering detection against wrap-arounds */
tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight);
tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
- flag);
+ newly_acked_sacked, flag);
} else {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
tcp_cong_avoid(sk, ack, prior_in_flight);
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
- u8 **hvpp, int estab)
+void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
+ const u8 **hvpp, int estab)
{
- unsigned char *ptr;
- struct tcphdr *th = tcp_hdr(skb);
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
- ptr = (unsigned char *)(th + 1);
+ ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
while (length > 0) {
}
EXPORT_SYMBOL(tcp_parse_options);
-static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
+static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
- __be32 *ptr = (__be32 *)(th + 1);
+ const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp, u8 **hvpp)
+static int tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct tcp_sock *tp, const u8 **hvpp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
/*
* Parse MD5 Signature option
*/
-u8 *tcp_parse_md5sig_option(struct tcphdr *th)
+const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
- int length = (th->doff << 2) - sizeof (*th);
- u8 *ptr = (u8*)(th + 1);
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
/* If the TCP option is too short, we can short cut */
if (length < TCPOLEN_MD5SIG)
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
-static inline void tcp_store_ts_recent(struct tcp_sock *tp)
-{
- tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = get_seconds();
-}
-
-static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
-{
- if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
- /* PAWS bug workaround wrt. ACK frames, the PAWS discard
- * extra check below makes sure this can only happen
- * for pure ACK frames. -DaveM
- *
- * Not only, also it occurs for expired timestamps.
- */
-
- if (tcp_paws_check(&tp->rx_opt, 0))
- tcp_store_ts_recent(tp);
- }
-}
-
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
* (borrowed from freebsd)
*/
-static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
return !before(end_seq, tp->rcv_wup) &&
!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+static void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tcp_hdr(skb)->fin)
- tcp_fin(skb, sk, tcp_hdr(skb));
+ tcp_fin(sk);
}
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
int eaten = -1;
if (skb->len)
tcp_event_data_recv(sk, skb);
if (th->fin)
- tcp_fin(skb, sk, th);
+ tcp_fin(sk);
if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static int tcp_should_expand_sndbuf(struct sock *sk)
+static int tcp_should_expand_sndbuf(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* If the user specified a specific send buffer setting, do
* not modify it.
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk)) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
- MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+ int sndmem = SKB_TRUESIZE(max_t(u32,
+ tp->rx_opt.mss_clamp,
+ tp->mss_cache) +
+ MAX_TCP_HEADER);
int demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
sndmem *= 2 * demanded;
* either form (or just set the sysctl tcp_stdurg).
*/
-static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
}
/* This is the 'fast' part of urgent handling. */
-static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
-static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, int syn_inerr)
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
{
- u8 *hash_location;
+ const u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
- if (!th->rst)
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
tcp_send_dupack(sk, skb);
+ }
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
- tcp_reset(sk);
+ /* RFC 5961 3.2 :
+ * If sequence number exactly matches RCV.NXT, then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ tcp_reset(sk);
+ else
+ tcp_send_challenge_ack(sk);
goto discard;
}
- /* ts_recent update must be made after we are sure that the packet
- * is in window.
- */
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
-
/* step 3: check security and precedence [ignored] */
- /* step 4: Check for a SYN in window. */
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* step 4: Check for a SYN
+ * RFC 5691 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+syn_challenge:
if (syn_inerr)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
- tcp_reset(sk);
- return -1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk);
+ goto discard;
}
- return 1;
+ return true;
discard:
__kfree_skb(skb);
- return 0;
+ return false;
}
/*
* tcp_data_queue when everything is OK.
*/
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
- int res;
/*
* Header prediction.
if (tp->copied_seq == tp->rcv_nxt &&
len - tcp_header_len <= tp->ucopy.len) {
#ifdef CONFIG_NET_DMA
- if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+ if (tp->ucopy.task == current &&
+ sock_owned_by_user(sk) &&
+ tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
copied_early = 1;
eaten = 1;
}
if (tcp_checksum_complete_user(sk, skb))
goto csum_error;
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
tcp_rcv_rtt_measure_ts(sk, skb);
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
* Standard slow path.
*/
- res = tcp_validate_incoming(sk, skb, th, 1);
- if (res <= 0)
- return -res;
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return 0;
step5:
- if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ if (th->ack &&
+ tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
EXPORT_SYMBOL(tcp_rcv_established);
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ const struct tcphdr *th, unsigned int len)
{
- u8 *hash_location;
+ const u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_cookie_values *cvp = tp->cookie_values;
}
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->copied_seq = tp->rcv_nxt;
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+ const struct tcphdr *th, unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
- int res;
tp->rx_opt.saw_tstamp = 0;
goto discard;
if (th->syn) {
+ if (th->fin)
+ goto discard;
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
return 0;
}
- res = tcp_validate_incoming(sk, skb, th, 0);
- if (res <= 0)
- return -res;
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
/* step 5: check the ACK field */
if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
switch (sk->sk_state) {
case TCP_SYN_RECV: