}
/* Packet ECN state for a SYN-ACK */
-static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) /* adjusts the ECN bits in TCP_SKB_CB(skb)->tcp_flags; takes the sock instead of the tcp_sock so the helpers below can be called on it */
{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk)) /* reached only when TCP_ECN_OK is set; presumably true when the congestion control module requires ECN -- confirm against tcp_ca_needs_ecn() */
+ INET_ECN_xmit(sk);
}
/* Packet ECN state for a SYN. */
-static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
+static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) /* resets tp->ecn_flags, then sets ECE|CWR on the SYN and TCP_ECN_OK when the condition below holds */
{
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk)) { /* tcp_ca_needs_ecn() enables the request even when the sysctl is not 1 */
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk)) /* only this case also calls INET_ECN_xmit(); the sysctl-only case does not */
+ INET_ECN_xmit(sk);
}
}
-static __inline__ void
-TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
+static void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk) /* sk is a new parameter: it is passed to tcp_ca_needs_ecn() and INET_ECN_xmit() below */
{
- if (inet_rsk(req)->ecn_ok)
+ if (inet_rsk(req)->ecn_ok) { /* braces added because the branch now holds two statements */
th->ece = 1;
+ if (tcp_ca_needs_ecn(sk)) /* th->ece is set whenever ecn_ok is set; INET_ECN_xmit() is called only when tcp_ca_needs_ecn() is also true */
+ INET_ECN_xmit(sk);
+ }
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
-static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_hdr(skb)->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else {
+ } else if (!tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
- shinfo->gso_segs = 1;
+ tcp_skb_pcount_set(skb, 1);
shinfo->gso_size = 0;
shinfo->gso_type = 0;
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ int wmem;
+
+ /* Keep one reference on sk_wmem_alloc.
+ * Will be released by sk_free() from here or tcp_tasklet_func()
+ */
+ wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+ /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+ * Wait until our queues (qdisc + devices) are drained.
+ * This gives :
+ * - fewer callbacks to tcp_write_xmit(), reducing stress (batches)
+ * - chance for incoming ACK (processed by another cpu maybe)
+ * to migrate this flow (skb->ooo_okay will be eventually set)
+ */
+ if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ goto out;
if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
!test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
unsigned long flags;
struct tsq_tasklet *tsq;
- /* Keep a ref on socket.
- * This last ref will be released in tcp_tasklet_func()
- */
- atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
-
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = &__get_cpu_var(tsq_tasklet);
list_add(&tp->tsq_node, &tsq->head);
tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
- } else {
- sock_wfree(skb);
+ return;
}
+out:
+ sk_free(sk);
}
/* This routine actually transmits TCP packets queued in by
tcp_ca_event(sk, CA_EVENT_TX_START);
/* if no packet is in qdisc/device queue, then allow XPS to select
- * another queue.
+ * another queue. We can be called from tcp_tsq_handler()
+ * which holds one reference to sk_wmem_alloc.
+ *
+ * TODO: Ideally, in-flight pure ACK packets should not matter here.
+ * One way to get this would be to set skb->truesize = 2 on them.
*/
- skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
- TCP_ECN_send(sk, skb, tcp_header_size);
+ tcp_ecn_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
+ /* OK, it's time to fill skb_shinfo(skb)->gso_segs */
+ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+
/* Our usage of tstamp should remain private */
skb->tstamp.tv64 = 0;
+
+ /* Cleanup our debris for IP stacks */
+ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (likely(err <= 0))
/* Advance write_seq and place onto the write_queue. */
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
- skb_header_release(skb);
+ __skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- shinfo->gso_segs = 1;
+ tcp_skb_pcount_set(skb, 1);
shinfo->gso_size = 0;
shinfo->gso_type = 0;
} else {
- shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
shinfo->gso_size = mss_now;
shinfo->gso_type = sk->sk_gso_type;
}
}
/* Link BUFF into the send queue. */
- skb_header_release(buff);
+ __skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
return 0;
tcp_set_skb_tso_segs(sk, buff, mss_now);
/* Link BUFF into the send queue. */
- skb_header_release(buff);
+ __skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk);
return 0;
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
- const struct sk_buff *fclone = skb + 1;
-
- if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
- fclone->fclone == SKB_FCLONE_CLONE)) {
+ if (unlikely(skb_fclone_busy(skb))) {
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
if (nskb == NULL)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
- skb_header_release(nskb);
+ __skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
sk_wmem_free_skb(sk, skb);
sk->sk_wmem_queued += nskb->truesize;
}
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
- TCP_ECN_send_synack(tcp_sk(sk), skb);
+ tcp_ecn_send_synack(sk, skb);
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- TCP_ECN_make_synack(req, th);
+ tcp_ecn_make_synack(req, th, sk);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
/* Setting of flags are superfluous here for callers (and ECE is
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
tcb->end_seq += skb->len;
- skb_header_release(skb);
+ __skb_header_release(skb);
__tcp_add_write_queue_tail(sk, skb);
sk->sk_wmem_queued += skb->truesize;
sk_mem_charge(sk, skb->truesize);
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tp->retrans_stamp = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);
- TCP_ECN_send_syn(sk, buff);
+ tcp_ecn_send_syn(sk, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
+ tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
if (sk->sk_state == TCP_CLOSE)
return;
+ tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
skb_mstamp_get(&buff->skb_mstamp);
tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
+EXPORT_SYMBOL_GPL(tcp_send_ack);
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long probe_max;
int err;
err = tcp_write_wakeup(sk);
if (icsk->icsk_backoff < sysctl_tcp_retries2)
icsk->icsk_backoff++;
icsk->icsk_probes_out++;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
- TCP_RTO_MAX);
+ probe_max = TCP_RTO_MAX;
} else {
/* If packet was not sent due to local congestion,
* do not backoff and do not remember icsk_probes_out.
*/
if (!icsk->icsk_probes_out)
icsk->icsk_probes_out = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- min(icsk->icsk_rto << icsk->icsk_backoff,
- TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ inet_csk_rto_backoff(icsk, probe_max),
+ TCP_RTO_MAX);
}
int tcp_rtx_synack(struct sock *sk, struct request_sock *req)