X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=7db22978114c6a0b391dd2d1ef99dd61e75ca4c9;hp=52b5c2d0ecd0aee7c99d1f31e8bf3a1115a293b9;hb=a0eee8681a9bc21d3f83260a8ceb3fa30236b189;hpb=044595d4e448305fbaec472eb7d22636d24e7d8c diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 52b5c2d0ecd0..7db22978114c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -83,9 +83,12 @@ int sysctl_tcp_ecn __read_mostly = 2; EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; -int sysctl_tcp_adv_win_scale __read_mostly = 2; +int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); +/* rfc5961 challenge ack rate limiting */ +int sysctl_tcp_challenge_ack_limit = 1000; + int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -112,6 +115,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -334,6 +338,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) incr = __tcp_grow_window(sk, skb); if (incr) { + incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; @@ -473,8 +478,11 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) if (!win_dep) { m -= (new_sample >> 3); new_sample += m; - } else if (m < new_sample) - new_sample = m << 3; + } else { + m <<= 3; + if (m < new_sample) + new_sample = m; + } } else { /* No previous measure. */ new_sample = m << 3; @@ -1295,13 +1303,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; - if (!in_sack && new_len < pkt_len) { + if (!in_sack && new_len < pkt_len) new_len += mss; - if (new_len > skb->len) - return 0; - } pkt_len = new_len; } + + if (pkt_len >= skb->len && !in_sack) + return 0; + err = tcp_fragment(sk, skb, pkt_len, mss); if (err < 0) return err; @@ -1310,25 +1319,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); - u8 sacked = TCP_SKB_CB(skb)->sacked; int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1347,13 +1357,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ - if (before(TCP_SKB_CB(skb)->seq, + if (before(start_seq, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + if (!after(end_seq, tp->frto_highmark)) state->flag |= FLAG_ONLY_ORIG_SACKED; } @@ -1371,8 +1381,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq)) + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) @@ -1391,6 +1400,9 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, return sacked; } +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, @@ -1398,9 +1410,20 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); + /* Adjust counters and hints for the newly sacked sequence + * range but discard the return value since prev is already + * marked. We must tag the range first because the seq + * advancement below implicitly advances + * tcp_highest_sack_seq() when skb is highest_sack. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1408,7 +1431,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->seq += shifted; skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); skb_shinfo(skb)->gso_segs -= pcount; /* When we're adding to gso_segs == 1, gso_size will be zero, @@ -1427,9 +1450,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* We discard results */ - tcp_sacktag_one(skb, sk, state, dup_sack, pcount); - /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1450,7 +1470,10 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tp->lost_cnt_hint -= tcp_skb_pcount(prev); } - TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; + TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + TCP_SKB_CB(prev)->end_seq++; + if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); @@ -1476,6 +1499,21 @@ static int skb_can_shift(const struct sk_buff *skb) return !skb_headlen(skb) && skb_is_nonlinear(skb); } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, + int pcount, int shiftlen) +{ + /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) + * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need + * to make sure not storing more than 65535 * 8 bytes per skb, + * even if current MSS is bigger. + */ + if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) + return 0; + if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) + return 0; + return skb_shift(to, from, shiftlen); +} + /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ @@ -1487,6 +1525,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; + int next_pcount; int pcount = 0; int len; int in_sack; @@ -1577,7 +1616,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } } - if (!skb_shift(prev, skb, len)) + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + + if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) goto out; @@ -1596,9 +1639,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; len = skb->len; - if (skb_shift(prev, skb, len)) { - pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + next_pcount = tcp_skb_pcount(skb); + if (tcp_skb_shift(prev, skb, next_pcount, len)) { + pcount += next_pcount; + tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0); } out: @@ -1667,10 +1711,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, - state, - dup_sack, - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -2239,11 +2287,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (!how) { - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - tp->undo_marker = tp->snd_una; - } else { + tp->undo_marker = tp->snd_una; + if (how) { tp->sacked_out = 0; tp->fackets_out = 0; } @@ -2556,6 +2601,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (cnt > packets) { if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -2844,11 +2890,14 @@ static inline void tcp_complete_cwr(struct sock *sk) /* Do not moderate cwnd if it's already undone in cwr or recovery. */ if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - else /* PRR */ + tp->snd_cwnd_stamp = tcp_time_stamp; + } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { + /* PRR algorithm. */ tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_stamp = tcp_time_stamp; + } } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -3393,7 +3442,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : @@ -3625,6 +3674,11 @@ static int tcp_process_frto(struct sock *sk, int flag) } } else { if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + if (!tcp_packets_in_flight(tp)) { + tcp_enter_frto_loss(sk, 2, flag); + return true; + } + /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); @@ -3673,6 +3727,52 @@ static int tcp_process_frto(struct sock *sk, int flag) return 0; } +/* RFC 5961 7 [ACK Throttling] */ +static void tcp_send_challenge_ack(struct sock *sk) +{ + /* unprotected vars, we dont care of overwrites */ + static u32 challenge_timestamp; + static unsigned int challenge_count; + u32 count, now = jiffies / HZ; + + if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + + challenge_timestamp = now; + ACCESS_ONCE(challenge_count) = + half + (u32)( + ((u64) random32() * sysctl_tcp_challenge_ack_limit) + >> 32); + } + count = ACCESS_ONCE(challenge_count); + if (count > 0) { + ACCESS_ONCE(challenge_count) = count - 1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + tcp_send_ack(sk); + } +} + +static void tcp_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); +} + +static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if (tcp_paws_check(&tp->rx_opt, 0)) + tcp_store_ts_recent(tp); + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3691,8 +3791,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* If the ack is older than previous acks * then we can probably ignore it. */ - if (before(ack, prior_snd_una)) + if (before(ack, prior_snd_una)) { + /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ + if (before(ack, prior_snd_una - tp->max_window)) { + tcp_send_challenge_ack(sk); + return -1; + } goto old_ack; + } /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). @@ -3715,6 +3821,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + if (flag & FLAG_UPDATE_TS_RECENT) + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -4005,27 +4117,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) EXPORT_SYMBOL(tcp_parse_md5sig_option); #endif -static inline void tcp_store_ts_recent(struct tcp_sock *tp) -{ - tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); -} - -static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) -{ - if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Not only, also it occurs for expired timestamps. - */ - - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); - } -} - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) @@ -5216,8 +5307,8 @@ out: /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ -static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, int syn_inerr) +static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) { const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); @@ -5242,38 +5333,48 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * an acknowledgment should be sent in reply (unless the RST * bit is set, if so drop the segment and return)". */ - if (!th->rst) + if (!th->rst) { + if (th->syn) + goto syn_challenge; tcp_send_dupack(sk, skb); + } goto discard; } /* Step 2: check RST bit */ if (th->rst) { - tcp_reset(sk); + /* RFC 5961 3.2 : + * If sequence number exactly matches RCV.NXT, then + * RESET the connection + * else + * Send a challenge ACK + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) + tcp_reset(sk); + else + tcp_send_challenge_ack(sk); goto discard; } - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - /* step 3: check security and precedence [ignored] */ - /* step 4: Check for a SYN in window. */ - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* step 4: Check for a SYN + * RFC 5691 4.2 : Send a challenge ack + */ + if (th->syn) { +syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); - tcp_reset(sk); - return -1; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + tcp_send_challenge_ack(sk); + goto discard; } - return 1; + return true; discard: __kfree_skb(skb); - return 0; + return false; } /* @@ -5303,7 +5404,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); - int res; /* * Header prediction. @@ -5388,7 +5488,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { #ifdef CONFIG_NET_DMA - if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && + tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { copied_early = 1; eaten = 1; } @@ -5424,6 +5526,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5435,9 +5540,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ @@ -5481,12 +5583,12 @@ slow_path: * Standard slow path. */ - res = tcp_validate_incoming(sk, skb, th, 1); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 1)) + return 0; step5: - if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + if (th->ack && + tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5734,6 +5836,7 @@ discard: } tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -5793,7 +5896,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; - int res; tp->rx_opt.saw_tstamp = 0; @@ -5809,6 +5911,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if (th->syn) { + if (th->fin) + goto discard; if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -5846,13 +5950,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } - res = tcp_validate_incoming(sk, skb, th, 0); - if (res <= 0) - return -res; + if (!tcp_validate_incoming(sk, skb, th, 0)) + return 0; /* step 5: check the ACK field */ if (th->ack) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; switch (sk->sk_state) { case TCP_SYN_RECV: