tcp: fix tcp_ack() performance problem

[pandora-kernel.git] / net / ipv4 / tcp_input.c
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index fb0fe97..a12b455 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -68,12 +68,12 @@
  #include <linux/module.h>
  #include <linux/sysctl.h>
  #include <linux/kernel.h>
+#include <linux/prefetch.h>
  #include <net/dst.h>
  #include <net/tcp.h>
  #include <net/inet_common.h>
  #include <linux/ipsec.h>
  #include <asm/unaligned.h>
-#include <net/netdma.h>
  #include <linux/errqueue.h>
  
  int sysctl_tcp_timestamps __read_mostly = 1;
@@ -201,28 +201,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk)
         return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
  }
  
-static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
  {
         if (tp->ecn_flags & TCP_ECN_OK)
                 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
  }
  
-static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
  {
         if (tcp_hdr(skb)->cwr)
                 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
  }
  
-static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
  {
         tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
  }
  
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
  {
-       if (!(tp->ecn_flags & TCP_ECN_OK))
-               return;
-
         switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
         case INET_ECN_NOT_ECT:
                 /* Funny extension: if ECT is not set on a segment,
@@ -233,30 +230,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
                         tcp_enter_quickack_mode((struct sock *)tp);
                 break;
         case INET_ECN_CE:
+               if (tcp_ca_needs_ecn((struct sock *)tp))
+                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
                         /* Better not delay acks, sender can have a very low cwnd */
                         tcp_enter_quickack_mode((struct sock *)tp);
                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
                 }
-               /* fallinto */
+               tp->ecn_flags |= TCP_ECN_SEEN;
+               break;
         default:
+               if (tcp_ca_needs_ecn((struct sock *)tp))
+                       tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
                 tp->ecn_flags |= TCP_ECN_SEEN;
+               break;
         }
  }
  
-static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+       if (tp->ecn_flags & TCP_ECN_OK)
+               __tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
  {
         if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
                 tp->ecn_flags &= ~TCP_ECN_OK;
  }
  
-static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
  {
         if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
                 tp->ecn_flags &= ~TCP_ECN_OK;
  }
  
-static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
  {
         if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
                 return true;
@@ -653,7 +663,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
         }
         icsk->icsk_ack.lrcvtime = now;
  
-       TCP_ECN_check_ce(tp, skb);
+       tcp_ecn_check_ce(tp, skb);
  
         if (skb->len >= 128)
                 tcp_grow_window(sk, skb);
@@ -1969,7 +1979,7 @@ void tcp_enter_loss(struct sock *sk)
                                        sysctl_tcp_reordering);
         tcp_set_ca_state(sk, TCP_CA_Loss);
         tp->high_seq = tp->snd_nxt;
-       TCP_ECN_queue_cwr(tp);
+       tcp_ecn_queue_cwr(tp);
  
         /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
          * loss recovery is underway except recurring timeout(s) on
@@ -2361,7 +2371,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
  
                 if (tp->prior_ssthresh > tp->snd_ssthresh) {
                         tp->snd_ssthresh = tp->prior_ssthresh;
-                       TCP_ECN_withdraw_cwr(tp);
+                       tcp_ecn_withdraw_cwr(tp);
                 }
         } else {
                 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2491,7 +2501,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
         tp->prr_delivered = 0;
         tp->prr_out = 0;
         tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-       TCP_ECN_queue_cwr(tp);
+       tcp_ecn_queue_cwr(tp);
  }
  
  static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
@@ -3020,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
         return packets_acked;
  }
  
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+                          u32 prior_snd_una)
+{
+       const struct skb_shared_info *shinfo;
+
+       /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+       if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+               return;
+
+       shinfo = skb_shinfo(skb);
+       if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+           between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+               __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
  /* Remove acknowledged frames from the retransmission queue. If our packet
   * is before the ack sequence we can discard it as it's confirmed to have
   * arrived at the other end.
@@ -3043,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
         first_ackt.v64 = 0;
  
         while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
-               struct skb_shared_info *shinfo = skb_shinfo(skb);
                 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                 u8 sacked = scb->sacked;
                 u32 acked_pcount;
  
-               if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-                   between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
-                       __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+               tcp_ack_tstamp(sk, skb, prior_snd_una);
  
                 /* Determine how many packets and what bytes were acked, tso and else */
                 if (after(scb->end_seq, tp->snd_una)) {
@@ -3064,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  
                         fully_acked = false;
                 } else {
+                       /* Speed up tcp_unlink_write_queue() and next loop */
+                       prefetchw(skb->next);
                         acked_pcount = tcp_skb_pcount(skb);
                 }
  
-               if (sacked & TCPCB_RETRANS) {
+               if (unlikely(sacked & TCPCB_RETRANS)) {
                         if (sacked & TCPCB_SACKED_RETRANS)
                                 tp->retrans_out -= acked_pcount;
                         flag |= FLAG_RETRANS_DATA_ACKED;
@@ -3098,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                  * connection startup slow start one packet too
                  * quickly.  This is severely frowned upon behavior.
                  */
-               if (!(scb->tcp_flags & TCPHDR_SYN)) {
+               if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
                         flag |= FLAG_DATA_ACKED;
                 } else {
                         flag |= FLAG_SYN_ACKED;
@@ -3110,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  
                 tcp_unlink_write_queue(skb, sk);
                 sk_wmem_free_skb(sk, skb);
-               if (skb == tp->retransmit_skb_hint)
+               if (unlikely(skb == tp->retransmit_skb_hint))
                         tp->retransmit_skb_hint = NULL;
-               if (skb == tp->lost_skb_hint)
+               if (unlikely(skb == tp->lost_skb_hint))
                         tp->lost_skb_hint = NULL;
         }
  
@@ -3123,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                 flag |= FLAG_SACK_RENEGING;
  
         skb_mstamp_get(&now);
-       if (first_ackt.v64) {
+       if (likely(first_ackt.v64)) {
                 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
                 ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
         }
@@ -3362,6 +3386,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
         }
  }
  
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+
+       if (icsk->icsk_ca_ops->in_ack_event)
+               icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
  /* This routine deals with incoming acks, but not outgoing ones. */
  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  {
@@ -3377,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         int acked = 0; /* Number of packets newly acked */
         long sack_rtt_us = -1L;
  
+       /* We very likely will need to access write queue head. */
+       prefetchw(sk->sk_write_queue.next);
+
         /* If the ack is older than previous acks
          * then we can probably ignore it.
          */
@@ -3421,10 +3456,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
                 tp->snd_una = ack;
                 flag |= FLAG_WIN_UPDATE;
  
-               tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+               tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
  
                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
         } else {
+               u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
                 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
                         flag |= FLAG_DATA;
                 else
@@ -3436,10 +3473,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
                         flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
                                                         &sack_rtt_us);
  
-               if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+               if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
                         flag |= FLAG_ECE;
+                       ack_ev_flags |= CA_ACK_ECE;
+               }
  
-               tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+               if (flag & FLAG_WIN_UPDATE)
+                       ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+               tcp_in_ack_event(sk, ack_ev_flags);
         }
  
         /* We passed data and got it acked, remove any soft error
@@ -4171,7 +4213,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
         struct sk_buff *skb1;
         u32 seq, end_seq;
  
-       TCP_ECN_check_ce(tp, skb);
+       tcp_ecn_check_ce(tp, skb);
  
         if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4354,7 +4396,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
         skb_dst_drop(skb);
         __skb_pull(skb, tcp_hdr(skb)->doff * 4);
  
-       TCP_ECN_accept_cwr(tp, skb);
+       tcp_ecn_accept_cwr(tp, skb);
  
         tp->rx_opt.dsack = 0;
  
@@ -4933,53 +4975,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
                __tcp_checksum_complete_user(sk, skb);
  }
  
-#ifdef CONFIG_NET_DMA
-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
-                                 int hlen)
-{
-       struct tcp_sock *tp = tcp_sk(sk);
-       int chunk = skb->len - hlen;
-       int dma_cookie;
-       bool copied_early = false;
-
-       if (tp->ucopy.wakeup)
-               return false;
-
-       if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-               tp->ucopy.dma_chan = net_dma_find_channel();
-
-       if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
-
-               dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
-                                                        skb, hlen,
-                                                        tp->ucopy.iov, chunk,
-                                                        tp->ucopy.pinned_list);
-
-               if (dma_cookie < 0)
-                       goto out;
-
-               tp->ucopy.dma_cookie = dma_cookie;
-               copied_early = true;
-
-               tp->ucopy.len -= chunk;
-               tp->copied_seq += chunk;
-               tcp_rcv_space_adjust(sk);
-
-               if ((tp->ucopy.len == 0) ||
-                   (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
-                   (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
-                       tp->ucopy.wakeup = 1;
-                       sk->sk_data_ready(sk);
-               }
-       } else if (chunk > 0) {
-               tp->ucopy.wakeup = 1;
-               sk->sk_data_ready(sk);
-       }
-out:
-       return copied_early;
-}
-#endif /* CONFIG_NET_DMA */
-
  /* Does PAWS and seqno based validation of an incoming segment, flags will
   * play significant role here.
   */
@@ -5159,27 +5154,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                         }
                 } else {
                         int eaten = 0;
-                       int copied_early = 0;
                         bool fragstolen = false;
  
-                       if (tp->copied_seq == tp->rcv_nxt &&
-                           len - tcp_header_len <= tp->ucopy.len) {
-#ifdef CONFIG_NET_DMA
-                               if (tp->ucopy.task == current &&
-                                   sock_owned_by_user(sk) &&
-                                   tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
-                                       copied_early = 1;
-                                       eaten = 1;
-                               }
-#endif
-                               if (tp->ucopy.task == current &&
-                                   sock_owned_by_user(sk) && !copied_early) {
-                                       __set_current_state(TASK_RUNNING);
+                       if (tp->ucopy.task == current &&
+                           tp->copied_seq == tp->rcv_nxt &&
+                           len - tcp_header_len <= tp->ucopy.len &&
+                           sock_owned_by_user(sk)) {
+                               __set_current_state(TASK_RUNNING);
  
-                                       if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
-                                               eaten = 1;
-                               }
-                               if (eaten) {
+                               if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
                                         /* Predicted packet is in window by definition.
                                          * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                          * Hence, check seq<=rcv_wup reduces to:
@@ -5195,9 +5178,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                         __skb_pull(skb, tcp_header_len);
                                         tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+                                       eaten = 1;
                                 }
-                               if (copied_early)
-                                       tcp_cleanup_rbuf(sk, skb->len);
                         }
                         if (!eaten) {
                                 if (tcp_checksum_complete_user(sk, skb))
@@ -5234,14 +5216,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                         goto no_ack;
                         }
  
-                       if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
-                               __tcp_ack_snd_check(sk, 0);
+                       __tcp_ack_snd_check(sk, 0);
  no_ack:
-#ifdef CONFIG_NET_DMA
-                       if (copied_early)
-                               __skb_queue_tail(&sk->sk_async_wait_queue, skb);
-                       else
-#endif
                         if (eaten)
                                 kfree_skb_partial(skb, fragstolen);
                         sk->sk_data_ready(sk);
@@ -5435,7 +5411,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                  *    state to ESTABLISHED..."
                  */
  
-               TCP_ECN_rcv_synack(tp, th);
+               tcp_ecn_rcv_synack(tp, th);
  
                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                 tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -5554,7 +5530,7 @@ discard:
                 tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
                 tp->max_window = tp->snd_wnd;
  
-               TCP_ECN_rcv_syn(tp, th);
+               tcp_ecn_rcv_syn(tp, th);
  
                 tcp_mtup_init(sk);
                 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5884,6 +5860,40 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  #endif
  }
  
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control; it requires setting ECT on all packets,
+ * including SYN. We invert the test in this case: If our
+ * local socket wants ECN, but peer only set ece/cwr (but not
+ * ECT in IP header) it's probably a non-DCTCP aware sender.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+                                  const struct sk_buff *skb,
+                                  const struct sock *listen_sk)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       const struct net *net = sock_net(listen_sk);
+       bool th_ecn = th->ece && th->cwr;
+       bool ect, need_ecn;
+
+       if (!th_ecn)
+               return;
+
+       ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+       need_ecn = tcp_ca_needs_ecn(listen_sk);
+
+       if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+               inet_rsk(req)->ecn_ok = 1;
+       else if (ect && need_ecn)
+               inet_rsk(req)->ecn_ok = 1;
+}
+
  int tcp_conn_request(struct request_sock_ops *rsk_ops,
                      const struct tcp_request_sock_ops *af_ops,
                      struct sock *sk, struct sk_buff *skb)
@@ -5944,7 +5954,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                 goto drop_and_free;
  
         if (!want_cookie || tmp_opt.tstamp_ok)
-               TCP_ECN_create_request(req, skb, sk);
+               tcp_ecn_create_request(req, skb, sk);
  
         if (want_cookie) {
                 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);