net/ipv4/tcp_timer.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 #include <linux/module.h>
  24 #include <net/tcp.h>
  25
  26 int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
  27 int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
  28 int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
  29 int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
  30 int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
  31 int sysctl_tcp_retries1 = TCP_RETR1;
  32 int sysctl_tcp_retries2 = TCP_RETR2;
  33 int sysctl_tcp_orphan_retries;
  34
  35 static void tcp_write_timer(unsigned long);
  36 static void tcp_delack_timer(unsigned long);
  37 static void tcp_keepalive_timer (unsigned long data);
  38
  39 void tcp_init_xmit_timers(struct sock *sk)
  40 {
  41         inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
  42                                   &tcp_keepalive_timer);
  43 }
  44
  45 EXPORT_SYMBOL(tcp_init_xmit_timers);
  46
  47 static void tcp_write_err(struct sock *sk)
  48 {
  49         sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
  50         sk->sk_error_report(sk);
  51
  52         tcp_done(sk);
  53         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
  54 }
  55
  56 /* Do not allow orphaned sockets to eat all our resources.
  57  * This is direct violation of TCP specs, but it is required
  58  * to prevent DoS attacks. It is called when a retransmission timeout
  59  * or zero probe timeout occurs on orphaned socket.
  60  *
  61  * Criterium is still not confirmed experimentally and may change.
  62  * We kill the socket, if:
  63  * 1. If number of orphaned sockets exceeds an administratively configured
  64  *    limit.
  65  * 2. If we have strong memory pressure.
  66  */
  67 static int tcp_out_of_resources(struct sock *sk, int do_reset)
  68 {
  69         struct tcp_sock *tp = tcp_sk(sk);
  70         int orphans = atomic_read(&tcp_orphan_count);
  71
  72         /* If peer does not open window for long time, or did not transmit
  73          * anything for long time, penalize it. */
  74         if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
  75                 orphans <<= 1;
  76
  77         /* If some dubious ICMP arrived, penalize even more. */
  78         if (sk->sk_err_soft)
  79                 orphans <<= 1;
  80
  81         if (orphans >= sysctl_tcp_max_orphans ||
  82             (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
  83              atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
  84                 if (net_ratelimit())
  85                         printk(KERN_INFO "Out of socket memory\n");
  86
  87                 /* Catch exceptional cases, when connection requires reset.
  88                  *      1. Last segment was sent recently. */
  89                 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
  90                     /*  2. Window is closed. */
  91                     (!tp->snd_wnd && !tp->packets_out))
  92                         do_reset = 1;
  93                 if (do_reset)
  94                         tcp_send_active_reset(sk, GFP_ATOMIC);
  95                 tcp_done(sk);
  96                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
  97                 return 1;
  98         }
  99         return 0;
 100 }
 101
 102 /* Calculate maximal number or retries on an orphaned socket. */
 103 static int tcp_orphan_retries(struct sock *sk, int alive)
 104 {
 105         int retries = sysctl_tcp_orphan_retries; /* May be zero. */
 106
 107         /* We know from an ICMP that something is wrong. */
 108         if (sk->sk_err_soft && !alive)
 109                 retries = 0;
 110
 111         /* However, if socket sent something recently, select some safe
 112          * number of retries. 8 corresponds to >100 seconds with minimal
 113          * RTO of 200msec. */
 114         if (retries == 0 && alive)
 115                 retries = 8;
 116         return retries;
 117 }
 118
 119 /* A write timeout has occurred. Process the after effects. */
 120 static int tcp_write_timeout(struct sock *sk)
 121 {
 122         const struct inet_connection_sock *icsk = inet_csk(sk);
 123         int retry_until;
 124
 125         if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 126                 if (icsk->icsk_retransmits)
 127                         dst_negative_advice(&sk->sk_dst_cache);
 128                 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 129         } else {
 130                 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 131                         /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 132                            hole detection. :-(
 133
 134                            It is place to make it. It is not made. I do not want
 135                            to make it. It is disguisting. It does not work in any
 136                            case. Let me to cite the same draft, which requires for
 137                            us to implement this:
 138
 139    "The one security concern raised by this memo is that ICMP black holes
 140    are often caused by over-zealous security administrators who block
 141    all ICMP messages.  It is vitally important that those who design and
 142    deploy security systems understand the impact of strict filtering on
 143    upper-layer protocols.  The safest web site in the world is worthless
 144    if most TCP implementations cannot transfer data from it.  It would
 145    be far nicer to have all of the black holes fixed rather than fixing
 146    all of the TCP implementations."
 147
 148                            Golden words :-).
 149                    */
 150
 151                         dst_negative_advice(&sk->sk_dst_cache);
 152                 }
 153
 154                 retry_until = sysctl_tcp_retries2;
 155                 if (sock_flag(sk, SOCK_DEAD)) {
 156                         const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
 157
 158                         retry_until = tcp_orphan_retries(sk, alive);
 159
 160                         if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
 161                                 return 1;
 162                 }
 163         }
 164
 165         if (icsk->icsk_retransmits >= retry_until) {
 166                 /* Has it gone just too far? */
 167                 tcp_write_err(sk);
 168                 return 1;
 169         }
 170         return 0;
 171 }
 172
 173 static void tcp_delack_timer(unsigned long data)
 174 {
 175         struct sock *sk = (struct sock*)data;
 176         struct tcp_sock *tp = tcp_sk(sk);
 177         struct inet_connection_sock *icsk = inet_csk(sk);
 178
 179         bh_lock_sock(sk);
 180         if (sock_owned_by_user(sk)) {
 181                 /* Try again later. */
 182                 icsk->icsk_ack.blocked = 1;
 183                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
 184                 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
 185                 goto out_unlock;
 186         }
 187
 188         sk_stream_mem_reclaim(sk);
 189
 190         if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
 191                 goto out;
 192
 193         if (time_after(icsk->icsk_ack.timeout, jiffies)) {
 194                 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
 195                 goto out;
 196         }
 197         icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 198
 199         if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 200                 struct sk_buff *skb;
 201
 202                 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
 203
 204                 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 205                         sk->sk_backlog_rcv(sk, skb);
 206
 207                 tp->ucopy.memory = 0;
 208         }
 209
 210         if (inet_csk_ack_scheduled(sk)) {
 211                 if (!icsk->icsk_ack.pingpong) {
 212                         /* Delayed ACK missed: inflate ATO. */
 213                         icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 214                 } else {
 215                         /* Delayed ACK missed: leave pingpong mode and
 216                          * deflate ATO.
 217                          */
 218                         icsk->icsk_ack.pingpong = 0;
 219                         icsk->icsk_ack.ato      = TCP_ATO_MIN;
 220                 }
 221                 tcp_send_ack(sk);
 222                 NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
 223         }
 224         TCP_CHECK_TIMER(sk);
 225
 226 out:
 227         if (tcp_memory_pressure)
 228                 sk_stream_mem_reclaim(sk);
 229 out_unlock:
 230         bh_unlock_sock(sk);
 231         sock_put(sk);
 232 }
 233
 234 static void tcp_probe_timer(struct sock *sk)
 235 {
 236         struct inet_connection_sock *icsk = inet_csk(sk);
 237         struct tcp_sock *tp = tcp_sk(sk);
 238         int max_probes;
 239
 240         if (tp->packets_out || !sk->sk_send_head) {
 241                 icsk->icsk_probes_out = 0;
 242                 return;
 243         }
 244
 245         /* *WARNING* RFC 1122 forbids this
 246          *
 247          * It doesn't AFAIK, because we kill the retransmit timer -AK
 248          *
 249          * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 250          * this behaviour in Solaris down as a bug fix. [AC]
 251          *
 252          * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
 253          * even if they advertise zero window. Hence, connection is killed only
 254          * if we received no ACKs for normal connection timeout. It is not killed
 255          * only because window stays zero for some time, window may be zero
 256          * until armageddon and even later. We are in full accordance
 257          * with RFCs, only probe timer combines both retransmission timeout
 258          * and probe timeout in one bottle.                             --ANK
 259          */
 260         max_probes = sysctl_tcp_retries2;
 261
 262         if (sock_flag(sk, SOCK_DEAD)) {
 263                 const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
 264
 265                 max_probes = tcp_orphan_retries(sk, alive);
 266
 267                 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
 268                         return;
 269         }
 270
 271         if (icsk->icsk_probes_out > max_probes) {
 272                 tcp_write_err(sk);
 273         } else {
 274                 /* Only send another probe if we didn't close things up. */
 275                 tcp_send_probe0(sk);
 276         }
 277 }
 278
 279 /*
 280  *      The TCP retransmit timer.
 281  */
 282
 283 static void tcp_retransmit_timer(struct sock *sk)
 284 {
 285         struct tcp_sock *tp = tcp_sk(sk);
 286         struct inet_connection_sock *icsk = inet_csk(sk);
 287
 288         if (!tp->packets_out)
 289                 goto out;
 290
 291         BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
 292
 293         if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
 294             !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
 295                 /* Receiver dastardly shrinks window. Our retransmits
 296                  * become zero probes, but we should not timeout this
 297                  * connection. If the socket is an orphan, time it out,
 298                  * we cannot allow such beasts to hang infinitely.
 299                  */
 300 #ifdef TCP_DEBUG
 301                 if (net_ratelimit()) {
 302                         struct inet_sock *inet = inet_sk(sk);
 303                         printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
 304                                NIPQUAD(inet->daddr), htons(inet->dport),
 305                                inet->num, tp->snd_una, tp->snd_nxt);
 306                 }
 307 #endif
 308                 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
 309                         tcp_write_err(sk);
 310                         goto out;
 311                 }
 312                 tcp_enter_loss(sk, 0);
 313                 tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
 314                 __sk_dst_reset(sk);
 315                 goto out_reset_timer;
 316         }
 317
 318         if (tcp_write_timeout(sk))
 319                 goto out;
 320
 321         if (icsk->icsk_retransmits == 0) {
 322                 if (icsk->icsk_ca_state == TCP_CA_Disorder ||
 323                     icsk->icsk_ca_state == TCP_CA_Recovery) {
 324                         if (tp->rx_opt.sack_ok) {
 325                                 if (icsk->icsk_ca_state == TCP_CA_Recovery)
 326                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 327                                 else
 328                                         NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 329                         } else {
 330                                 if (icsk->icsk_ca_state == TCP_CA_Recovery)
 331                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 332                                 else
 333                                         NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 334                         }
 335                 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 336                         NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 337                 } else {
 338                         NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
 339                 }
 340         }
 341
 342         if (tcp_use_frto(sk)) {
 343                 tcp_enter_frto(sk);
 344         } else {
 345                 tcp_enter_loss(sk, 0);
 346         }
 347
 348         if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
 349                 /* Retransmission failed because of local congestion,
 350                  * do not backoff.
 351                  */
 352                 if (!icsk->icsk_retransmits)
 353                         icsk->icsk_retransmits = 1;
 354                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 355                                           min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
 356                                           TCP_RTO_MAX);
 357                 goto out;
 358         }
 359
 360         /* Increase the timeout each time we retransmit.  Note that
 361          * we do not increase the rtt estimate.  rto is initialized
 362          * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 363          * that doubling rto each time is the least we can get away with.
 364          * In KA9Q, Karn uses this for the first few times, and then
 365          * goes to quadratic.  netBSD doubles, but only goes up to *64,
 366          * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
 367          * defined in the protocol as the maximum possible RTT.  I guess
 368          * we'll have to use something other than TCP to talk to the
 369          * University of Mars.
 370          *
 371          * PAWS allows us longer timeouts and large windows, so once
 372          * implemented ftp to mars will work nicely. We will have to fix
 373          * the 120 second clamps though!
 374          */
 375         icsk->icsk_backoff++;
 376         icsk->icsk_retransmits++;
 377
 378 out_reset_timer:
 379         icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 380         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
 381         if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 382                 __sk_dst_reset(sk);
 383
 384 out:;
 385 }
 386
 387 static void tcp_write_timer(unsigned long data)
 388 {
 389         struct sock *sk = (struct sock*)data;
 390         struct inet_connection_sock *icsk = inet_csk(sk);
 391         int event;
 392
 393         bh_lock_sock(sk);
 394         if (sock_owned_by_user(sk)) {
 395                 /* Try again later */
 396                 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
 397                 goto out_unlock;
 398         }
 399
 400         if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 401                 goto out;
 402
 403         if (time_after(icsk->icsk_timeout, jiffies)) {
 404                 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 405                 goto out;
 406         }
 407
 408         event = icsk->icsk_pending;
 409         icsk->icsk_pending = 0;
 410
 411         switch (event) {
 412         case ICSK_TIME_RETRANS:
 413                 tcp_retransmit_timer(sk);
 414                 break;
 415         case ICSK_TIME_PROBE0:
 416                 tcp_probe_timer(sk);
 417                 break;
 418         }
 419         TCP_CHECK_TIMER(sk);
 420
 421 out:
 422         sk_stream_mem_reclaim(sk);
 423 out_unlock:
 424         bh_unlock_sock(sk);
 425         sock_put(sk);
 426 }
 427
 428 /*
 429  *      Timer for listening sockets
 430  */
 431
 432 static void tcp_synack_timer(struct sock *sk)
 433 {
 434         inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
 435                                    TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 436 }
 437
 438 void tcp_set_keepalive(struct sock *sk, int val)
 439 {
 440         if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
 441                 return;
 442
 443         if (val && !sock_flag(sk, SOCK_KEEPOPEN))
 444                 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 445         else if (!val)
 446                 inet_csk_delete_keepalive_timer(sk);
 447 }
 448
 449
 450 static void tcp_keepalive_timer (unsigned long data)
 451 {
 452         struct sock *sk = (struct sock *) data;
 453         struct inet_connection_sock *icsk = inet_csk(sk);
 454         struct tcp_sock *tp = tcp_sk(sk);
 455         __u32 elapsed;
 456
 457         /* Only process if socket is not in use. */
 458         bh_lock_sock(sk);
 459         if (sock_owned_by_user(sk)) {
 460                 /* Try again later. */
 461                 inet_csk_reset_keepalive_timer (sk, HZ/20);
 462                 goto out;
 463         }
 464
 465         if (sk->sk_state == TCP_LISTEN) {
 466                 tcp_synack_timer(sk);
 467                 goto out;
 468         }
 469
 470         if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 471                 if (tp->linger2 >= 0) {
 472                         const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 473
 474                         if (tmo > 0) {
 475                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 476                                 goto out;
 477                         }
 478                 }
 479                 tcp_send_active_reset(sk, GFP_ATOMIC);
 480                 goto death;
 481         }
 482
 483         if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
 484                 goto out;
 485
 486         elapsed = keepalive_time_when(tp);
 487
 488         /* It is alive without keepalive 8) */
 489         if (tp->packets_out || sk->sk_send_head)
 490                 goto resched;
 491
 492         elapsed = tcp_time_stamp - tp->rcv_tstamp;
 493
 494         if (elapsed >= keepalive_time_when(tp)) {
 495                 if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
 496                      (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
 497                         tcp_send_active_reset(sk, GFP_ATOMIC);
 498                         tcp_write_err(sk);
 499                         goto out;
 500                 }
 501                 if (tcp_write_wakeup(sk) <= 0) {
 502                         icsk->icsk_probes_out++;
 503                         elapsed = keepalive_intvl_when(tp);
 504                 } else {
 505                         /* If keepalive was lost due to local congestion,
 506                          * try harder.
 507                          */
 508                         elapsed = TCP_RESOURCE_PROBE_INTERVAL;
 509                 }
 510         } else {
 511                 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
 512                 elapsed = keepalive_time_when(tp) - elapsed;
 513         }
 514
 515         TCP_CHECK_TIMER(sk);
 516         sk_stream_mem_reclaim(sk);
 517
 518 resched:
 519         inet_csk_reset_keepalive_timer (sk, elapsed);
 520         goto out;
 521
 522 death:
 523         tcp_done(sk);
 524
 525 out:
 526         bh_unlock_sock(sk);
 527         sock_put(sk);
 528 }