net/ipv4/tcp_minisocks.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  */
  22
  23 #include <linux/config.h>
  24 #include <linux/mm.h>
  25 #include <linux/module.h>
  26 #include <linux/sysctl.h>
  27 #include <linux/workqueue.h>
  28 #include <net/tcp.h>
  29 #include <net/inet_common.h>
  30 #include <net/xfrm.h>
  31
   32 #ifdef CONFIG_SYSCTL
   33 #define SYNC_INIT 0 /* let the user enable it */
   34 #else
   35 #define SYNC_INIT 1
   36 #endif
   37
      /* Non-zero enables the shortened ("recycled") TIME-WAIT handling that is
       * tested in tcp_time_wait() and tcp_timewait_state_process().
       */
   38 int sysctl_tcp_tw_recycle;
      /* Ceiling compared against tcp_tw_count in tcp_time_wait(): once the count
       * reaches it, no further timewait bucket is allocated.
       */
   39 int sysctl_tcp_max_tw_buckets = NR_FILE*2;
   40
      /* Initial value is SYNC_INIT: 0 when CONFIG_SYSCTL is set, 1 otherwise.
       * NOTE(review): not read in the part of the file shown here.
       */
   41 int sysctl_tcp_syncookies = SYNC_INIT;
      /* NOTE(review): not read in the part of the file shown here. */
   42 int sysctl_tcp_abort_on_overflow;
   43
      /* Forward declaration; the definition follows tcp_tw_deschedule(). */
   44 static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo);
  45
  46 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
  47 {
  48         if (seq == s_win)
  49                 return 1;
  50         if (after(end_seq, s_win) && before(seq, e_win))
  51                 return 1;
  52         return (seq == e_win && seq == end_seq);
  53 }
  54
   55 /* New-style handling of TIME_WAIT sockets. */
   56
      /* Number of timewait buckets currently linked on one of the kill lists
       * (tcp_tw_death_row[] or tcp_twcal_row[]).  It is updated under
       * tw_death_lock by the schedule/deschedule/reaper functions below and
       * read without the lock in tcp_time_wait().
       */
   57 int tcp_tw_count;
  58
   59 /*
   60  * * Main purpose of TIME-WAIT state is to close connection gracefully,
   61  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
   62  *   (and, probably, tail of data) and one or more our ACKs are lost.
   63  * * What is TIME-WAIT timeout? It is associated with maximal packet
   64  *   lifetime in the internet, which results in wrong conclusion, that
   65  *   it is set to catch "old duplicate segments" wandering out of their path.
   66  *   It is not quite correct. This timeout is calculated so that it exceeds
   67  *   maximal retransmission timeout enough to allow to lose one (or more)
   68  *   segments sent by peer and our ACKs. This time may be calculated from RTO.
   69  * * When TIME-WAIT socket receives RST, it means that another end
   70  *   finally closed and we are allowed to kill TIME-WAIT too.
   71  * * Second purpose of TIME-WAIT is catching old duplicate segments.
   72  *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
   73  *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
   74  * * If we invented some more clever way to catch duplicates
   75  *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
   76  *
   77  * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
   78  * When you compare it to RFCs, please, read section SEGMENT ARRIVES
   79  * from the very beginning.
   80  *
   81  * NOTE. With recycling (and later with fin-wait-2) TW bucket
   82  * is _not_ stateless. It means, that strictly speaking we must
   83  * spinlock it. I do not want! Well, probability of misbehaviour
   84  * is ridiculously low and, seems, we could use some mb() tricks
   85  * to avoid misread sequence numbers, states etc.  --ANK
       *
       * Summary of the function below: it processes one incoming segment
       * (skb / th) for the timewait bucket tw, which is either in the dead
       * FIN-WAIT-2 substate or in TIME-WAIT proper.
       *
       * Return value and what happened to the bucket reference:
       *   TCP_TW_SUCCESS - nothing further to do; inet_twsk_put(tw) was
       *                    called here (after tcp_tw_deschedule() on the
       *                    "kill" path).
       *   TCP_TW_RST     - bucket was descheduled and inet_twsk_put(tw) was
       *                    called here.
       *   TCP_TW_ACK     - inet_twsk_put() was NOT called; the bucket is to
       *                    be released by the caller.
       *   TCP_TW_SYN     - new SYN accepted; a proposed initial sequence
       *                    number is left in TCP_SKB_CB(skb)->when and
       *                    inet_twsk_put() was NOT called.
   86  */
   87 enum tcp_tw_status
   88 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
   89                            const struct tcphdr *th)
   90 {
   91         struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
   92         struct tcp_options_received tmp_opt;
   93         int paws_reject = 0;
   94
   95         tmp_opt.saw_tstamp = 0;
              /* Options are parsed only when the header is longer than a bare
               * TCP header and this bucket has a recorded timestamp.  If a
               * timestamp option is present, the PAWS check runs against the
               * values saved in the bucket.
               */
   96         if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
   97                 tcp_parse_options(skb, &tmp_opt, 0);
   98
   99                 if (tmp_opt.saw_tstamp) {
  100                         tmp_opt.ts_recent       = tcptw->tw_ts_recent;
  101                         tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
  102                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
  103                 }
  104         }
  105
  106         if (tw->tw_substate == TCP_FIN_WAIT2) {
  107                 /* Just repeat all the checks of tcp_rcv_state_process() */
  108
  109                 /* Out of window, send ACK */
  110                 if (paws_reject ||
  111                     !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
  112                                    tcptw->tw_rcv_nxt,
  113                                    tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
  114                         return TCP_TW_ACK;
  115
                      /* The "kill" label sits inside the TIME-WAIT branch further
                       * down: deschedule, drop the reference, report success.
                       */
  116                 if (th->rst)
  117                         goto kill;
  118
  119                 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
  120                         goto kill_with_rst;
  121
  122                 /* Dup ACK? */
  123                 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
  124                     TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
  125                         inet_twsk_put(tw);
  126                         return TCP_TW_SUCCESS;
  127                 }
  128
  129                 /* New data or FIN. If new data arrive after half-duplex close,
  130                  * reset.
  131                  */
                      /* Anything other than a FIN ending exactly one past
                       * tw_rcv_nxt is refused.  The label below is also the
                       * target of the SYN check above.
                       */
  132                 if (!th->fin ||
  133                     TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
  134 kill_with_rst:
  135                         tcp_tw_deschedule(tw);
  136                         inet_twsk_put(tw);
  137                         return TCP_TW_RST;
  138                 }
  139
  140                 /* FIN arrived, enter true time-wait state. */
  141                 tw->tw_substate   = TCP_TIME_WAIT;
  142                 tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  143                 if (tmp_opt.saw_tstamp) {
  144                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
  145                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
  146                 }
  147
  148                 /* I am ashamed, but failed to make it more elegant.
  149                  * Yes, it is direct reference to IP, which is impossible
  150                  * to generalize to IPv6. Taking into account that IPv6
  151                  * does not understand recycling in any case, it is not
  152                  * a big problem in practice. --ANK */
                      /* Recycling path: use the per-bucket tw_timeout; otherwise
                       * the full TCP_TIMEWAIT_LEN.
                       */
  153                 if (tw->tw_family == AF_INET &&
  154                     sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp &&
  155                     tcp_v4_tw_remember_stamp(tw))
  156                         tcp_tw_schedule(tw, tw->tw_timeout);
  157                 else
  158                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
                      /* No inet_twsk_put() on this path. */
  159                 return TCP_TW_ACK;
  160         }
  161
  162         /*
  163          *      Now real TIME-WAIT state.
  164          *
  165          *      RFC 1122:
  166          *      "When a connection is [...] on TIME-WAIT state [...]
  167          *      [a TCP] MAY accept a new SYN from the remote TCP to
  168          *      reopen the connection directly, if it:
  169          *
  170          *      (1)  assigns its initial sequence number for the new
  171          *      connection to be larger than the largest sequence
  172          *      number it used on the previous connection incarnation,
  173          *      and
  174          *
  175          *      (2)  returns to TIME-WAIT state if the SYN turns out
  176          *      to be an old duplicate".
  177          */
  178
  179         if (!paws_reject &&
  180             (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
  181              (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
  182                 /* In window segment, it may be only reset or bare ack. */
  183
  184                 if (th->rst) {
  185                         /* This is TIME_WAIT assassination, in two flavors.
  186                          * Oh well... nobody has a sufficient solution to this
  187                          * protocol bug yet.
  188                          */
                              /* With sysctl_tcp_rfc1337 set, the RST does not
                               * kill the bucket; it falls through and the
                               * timer is restarted like for a bare ACK.
                               */
  189                         if (sysctl_tcp_rfc1337 == 0) {
  190 kill:
  191                                 tcp_tw_deschedule(tw);
  192                                 inet_twsk_put(tw);
  193                                 return TCP_TW_SUCCESS;
  194                         }
  195                 }
  196                 tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
  197
  198                 if (tmp_opt.saw_tstamp) {
  199                         tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
  200                         tcptw->tw_ts_recent_stamp = xtime.tv_sec;
  201                 }
  202
  203                 inet_twsk_put(tw);
  204                 return TCP_TW_SUCCESS;
  205         }
  206
  207         /* Out of window segment.
  208
  209            All the segments are ACKed immediately.
  210
  211            The only exception is new SYN. We accept it, if it is
  212            not old duplicate and we are not in danger to be killed
  213            by delayed old duplicates. RFC check is that it has
  214            newer sequence number works at rates <40Mbit/sec.
  215            However, if paws works, it is reliable AND even more,
  216            we even may relax silly seq space cutoff.
  217
  218            RED-PEN: we violate main RFC requirement, if this SYN will appear
  219            old duplicate (i.e. we receive RST in reply to SYN-ACK),
  220            we must return socket to time-wait state. It is not good,
  221            but not fatal yet.
  222          */
  223
  224         if (th->syn && !th->rst && !th->ack && !paws_reject &&
  225             (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
  226              (tmp_opt.saw_tstamp &&
  227               (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
                      /* Proposed ISN: tw_snd_nxt advanced by 65535 + 2.  Zero is
                       * skipped; presumably 0 means "no ISN chosen" to whoever
                       * reads ->when - TODO confirm against the caller.
                       */
  228                 u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
  229                 if (isn == 0)
  230                         isn++;
  231                 TCP_SKB_CB(skb)->when = isn;
  232                 return TCP_TW_SYN;
  233         }
  234
  235         if (paws_reject)
  236                 NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
  237
  238         if(!th->rst) {
  239                 /* In this case we must reset the TIMEWAIT timer.
  240                  *
  241                  * If it is ACKless SYN it may be both old duplicate
  242                  * and new good SYN with random sequence number <rcv_nxt.
  243                  * Do not reschedule in the last case.
  244                  */
  245                 if (paws_reject || th->ack)
  246                         tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
  247
  248                 /* Send ACK. Note, we do not put the bucket,
  249                  * it will be released by caller.
  250                  */
  251                 return TCP_TW_ACK;
  252         }
              /* Out-of-window RST: dropped without a reply. */
  253         inet_twsk_put(tw);
  254         return TCP_TW_SUCCESS;
  255 }
 256
 257 /*
 258  * Move a socket to time-wait or dead fin-wait-2 state.
 259  */
 260 void tcp_time_wait(struct sock *sk, int state, int timeo)
 261 {
 262         struct inet_timewait_sock *tw = NULL;
 263         const struct tcp_sock *tp = tcp_sk(sk);
 264         int recycle_ok = 0;
 265
 266         if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
 267                 recycle_ok = tp->af_specific->remember_stamp(sk);
 268
 269         if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
 270                 tw = inet_twsk_alloc(sk, state);
 271
 272         if (tw != NULL) {
 273                 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 274                 const struct inet_connection_sock *icsk = inet_csk(sk);
 275                 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 276
 277                 tw->tw_rcv_wscale       = tp->rx_opt.rcv_wscale;
 278                 tcptw->tw_rcv_nxt       = tp->rcv_nxt;
 279                 tcptw->tw_snd_nxt       = tp->snd_nxt;
 280                 tcptw->tw_rcv_wnd       = tcp_receive_window(tp);
 281                 tcptw->tw_ts_recent     = tp->rx_opt.ts_recent;
 282                 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 283
 284 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 285                 if (tw->tw_family == PF_INET6) {
 286                         struct ipv6_pinfo *np = inet6_sk(sk);
 287                         struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
 288
 289                         ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
 290                         ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
 291                         tw->tw_ipv6only = np->ipv6only;
 292                 }
 293 #endif
 294                 /* Linkage updates. */
 295                 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 296
 297                 /* Get the TIME_WAIT timeout firing. */
 298                 if (timeo < rto)
 299                         timeo = rto;
 300
 301                 if (recycle_ok) {
 302                         tw->tw_timeout = rto;
 303                 } else {
 304                         tw->tw_timeout = TCP_TIMEWAIT_LEN;
 305                         if (state == TCP_TIME_WAIT)
 306                                 timeo = TCP_TIMEWAIT_LEN;
 307                 }
 308
 309                 tcp_tw_schedule(tw, timeo);
 310                 inet_twsk_put(tw);
 311         } else {
 312                 /* Sorry, if we're out of memory, just CLOSE this
 313                  * socket up.  We've got bigger problems than
 314                  * non-graceful socket closings.
 315                  */
 316                 if (net_ratelimit())
 317                         printk(KERN_INFO "TCP: time wait bucket table overflow\n");
 318         }
 319
 320         tcp_update_metrics(sk);
 321         tcp_done(sk);
 322 }
 323
  324 /* Kill off TIME_WAIT sockets once their lifetime has expired. */
      /* Index into tcp_tw_death_row[] of the slot tcp_twkill() reaps on its
       * next run; tcp_twkill() advances it by one, modulo TCP_TWKILL_SLOTS.
       */
  325 static int tcp_tw_death_row_slot;
  326
  327 static void tcp_twkill(unsigned long);
  328
  329 /* TIME_WAIT reaping mechanism. */
  330 #define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
  331 #define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
  332
      /* Quota passed to tcp_do_twkill_work(), which stops and reports
       * "quota exceeded" once it has killed more than this many buckets.
       */
  333 #define TCP_TWKILL_QUOTA        100
  334
      /* Slow-timer lists, one per slot, filled by tcp_tw_schedule(). */
  335 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
      /* Taken around every manipulation of the death-row and twcal lists
       * and of tcp_tw_count in the functions below.
       */
  336 static DEFINE_SPINLOCK(tw_death_lock);
      /* Timer running tcp_twkill(); re-armed TCP_TWKILL_PERIOD ahead. */
  337 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
  338 static void twkill_work(void *);
      /* Work item that tcp_twkill() schedules when a slot could not be
       * emptied within the quota.
       */
  339 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
      /* Bitmask, one bit per death-row slot, of slots left for twkill_work(). */
  340 static u32 twkill_thread_slots;
 341
 342 /* Returns non-zero if quota exceeded.  */
 343 static int tcp_do_twkill_work(int slot, unsigned int quota)
 344 {
 345         struct inet_timewait_sock *tw;
 346         struct hlist_node *node;
 347         unsigned int killed;
 348         int ret;
 349
 350         /* NOTE: compare this to previous version where lock
 351          * was released after detaching chain. It was racy,
 352          * because tw buckets are scheduled in not serialized context
 353          * in 2.3 (with netfilter), and with softnet it is common, because
 354          * soft irqs are not sequenced.
 355          */
 356         killed = 0;
 357         ret = 0;
 358 rescan:
 359         inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
 360                 __inet_twsk_del_dead_node(tw);
 361                 spin_unlock(&tw_death_lock);
 362                 __inet_twsk_kill(tw, &tcp_hashinfo);
 363                 inet_twsk_put(tw);
 364                 killed++;
 365                 spin_lock(&tw_death_lock);
 366                 if (killed > quota) {
 367                         ret = 1;
 368                         break;
 369                 }
 370
 371                 /* While we dropped tw_death_lock, another cpu may have
 372                  * killed off the next TW bucket in the list, therefore
 373                  * do a fresh re-read of the hlist head node with the
 374                  * lock reacquired.  We still use the hlist traversal
 375                  * macro in order to get the prefetches.
 376                  */
 377                 goto rescan;
 378         }
 379
 380         tcp_tw_count -= killed;
 381         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
 382
 383         return ret;
 384 }
 385
 386 static void tcp_twkill(unsigned long dummy)
 387 {
 388         int need_timer, ret;
 389
 390         spin_lock(&tw_death_lock);
 391
 392         if (tcp_tw_count == 0)
 393                 goto out;
 394
 395         need_timer = 0;
 396         ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
 397         if (ret) {
 398                 twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
 399                 mb();
 400                 schedule_work(&tcp_twkill_work);
 401                 need_timer = 1;
 402         } else {
 403                 /* We purged the entire slot, anything left?  */
 404                 if (tcp_tw_count)
 405                         need_timer = 1;
 406         }
 407         tcp_tw_death_row_slot =
 408                 ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
 409         if (need_timer)
 410                 mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
 411 out:
 412         spin_unlock(&tw_death_lock);
 413 }
 414
 415 extern void twkill_slots_invalid(void);
 416
 417 static void twkill_work(void *dummy)
 418 {
 419         int i;
 420
 421         if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
 422                 twkill_slots_invalid();
 423
 424         while (twkill_thread_slots) {
 425                 spin_lock_bh(&tw_death_lock);
 426                 for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
 427                         if (!(twkill_thread_slots & (1 << i)))
 428                                 continue;
 429
 430                         while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
 431                                 if (need_resched()) {
 432                                         spin_unlock_bh(&tw_death_lock);
 433                                         schedule();
 434                                         spin_lock_bh(&tw_death_lock);
 435                                 }
 436                         }
 437
 438                         twkill_thread_slots &= ~(1 << i);
 439                 }
 440                 spin_unlock_bh(&tw_death_lock);
 441         }
 442 }
 443
 444 /* These are always called from BH context.  See callers in
 445  * tcp_input.c to verify this.
 446  */
 447
 448 /* This is for handling early-kills of TIME_WAIT sockets. */
 449 void tcp_tw_deschedule(struct inet_timewait_sock *tw)
 450 {
 451         spin_lock(&tw_death_lock);
 452         if (inet_twsk_del_dead_node(tw)) {
 453                 inet_twsk_put(tw);
 454                 if (--tcp_tw_count == 0)
 455                         del_timer(&tcp_tw_timer);
 456         }
 457         spin_unlock(&tw_death_lock);
 458         __inet_twsk_kill(tw, &tcp_hashinfo);
 459 }
 460
  461 /* Short-time timewait calendar */
  462
      /* Index of the first calendar slot still to be examined by
       * tcp_twcal_tick(); -1 while the calendar is idle (timer not armed).
       */
  463 static int tcp_twcal_hand = -1;
      /* jiffies value that corresponds to slot tcp_twcal_hand. */
  464 static int tcp_twcal_jiffie;
  465 static void tcp_twcal_tick(unsigned long);
  466 static struct timer_list tcp_twcal_timer =
  467                 TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
      /* Calendar lists, one per slot of (1 << TCP_TW_RECYCLE_TICK) jiffies. */
  468 static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
 469
 470 static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
 471 {
 472         struct hlist_head *list;
 473         int slot;
 474
 475         /* timeout := RTO * 3.5
 476          *
 477          * 3.5 = 1+2+0.5 to wait for two retransmits.
 478          *
 479          * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
 480          * our ACK acking that FIN can be lost. If N subsequent retransmitted
 481          * FINs (or previous seqments) are lost (probability of such event
 482          * is p^(N+1), where p is probability to lose single packet and
 483          * time to detect the loss is about RTO*(2^N - 1) with exponential
 484          * backoff). Normal timewait length is calculated so, that we
 485          * waited at least for one retransmitted FIN (maximal RTO is 120sec).
 486          * [ BTW Linux. following BSD, violates this requirement waiting
 487          *   only for 60sec, we should wait at least for 240 secs.
 488          *   Well, 240 consumes too much of resources 8)
 489          * ]
 490          * This interval is not reduced to catch old duplicate and
 491          * responces to our wandering segments living for two MSLs.
 492          * However, if we use PAWS to detect
 493          * old duplicates, we can reduce the interval to bounds required
 494          * by RTO, rather than MSL. So, if peer understands PAWS, we
 495          * kill tw bucket after 3.5*RTO (it is important that this number
 496          * is greater than TS tick!) and detect old duplicates with help
 497          * of PAWS.
 498          */
 499         slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
 500
 501         spin_lock(&tw_death_lock);
 502
 503         /* Unlink it, if it was scheduled */
 504         if (inet_twsk_del_dead_node(tw))
 505                 tcp_tw_count--;
 506         else
 507                 atomic_inc(&tw->tw_refcnt);
 508
 509         if (slot >= TCP_TW_RECYCLE_SLOTS) {
 510                 /* Schedule to slow timer */
 511                 if (timeo >= TCP_TIMEWAIT_LEN) {
 512                         slot = TCP_TWKILL_SLOTS-1;
 513                 } else {
 514                         slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
 515                         if (slot >= TCP_TWKILL_SLOTS)
 516                                 slot = TCP_TWKILL_SLOTS-1;
 517                 }
 518                 tw->tw_ttd = jiffies + timeo;
 519                 slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
 520                 list = &tcp_tw_death_row[slot];
 521         } else {
 522                 tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
 523
 524                 if (tcp_twcal_hand < 0) {
 525                         tcp_twcal_hand = 0;
 526                         tcp_twcal_jiffie = jiffies;
 527                         tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
 528                         add_timer(&tcp_twcal_timer);
 529                 } else {
 530                         if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
 531                                 mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
 532                         slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
 533                 }
 534                 list = &tcp_twcal_row[slot];
 535         }
 536
 537         hlist_add_head(&tw->tw_death_node, list);
 538
 539         if (tcp_tw_count++ == 0)
 540                 mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
 541         spin_unlock(&tw_death_lock);
 542 }
 543
 544 void tcp_twcal_tick(unsigned long dummy)
 545 {
 546         int n, slot;
 547         unsigned long j;
 548         unsigned long now = jiffies;
 549         int killed = 0;
 550         int adv = 0;
 551
 552         spin_lock(&tw_death_lock);
 553         if (tcp_twcal_hand < 0)
 554                 goto out;
 555
 556         slot = tcp_twcal_hand;
 557         j = tcp_twcal_jiffie;
 558
 559         for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
 560                 if (time_before_eq(j, now)) {
 561                         struct hlist_node *node, *safe;
 562                         struct inet_timewait_sock *tw;
 563
 564                         inet_twsk_for_each_inmate_safe(tw, node, safe,
 565                                                        &tcp_twcal_row[slot]) {
 566                                 __inet_twsk_del_dead_node(tw);
 567                                 __inet_twsk_kill(tw, &tcp_hashinfo);
 568                                 inet_twsk_put(tw);
 569                                 killed++;
 570                         }
 571                 } else {
 572                         if (!adv) {
 573                                 adv = 1;
 574                                 tcp_twcal_jiffie = j;
 575                                 tcp_twcal_hand = slot;
 576                         }
 577
 578                         if (!hlist_empty(&tcp_twcal_row[slot])) {
 579                                 mod_timer(&tcp_twcal_timer, j);
 580                                 goto out;
 581                         }
 582                 }
 583                 j += (1<<TCP_TW_RECYCLE_TICK);
 584                 slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
 585         }
 586         tcp_twcal_hand = -1;
 587
 588 out:
 589         if ((tcp_tw_count -= killed) == 0)
 590                 del_timer(&tcp_tw_timer);
 591         NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
 592         spin_unlock(&tw_death_lock);
 593 }
 594
 595 /* This is not only more efficient than what we used to do, it eliminates
 596  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 597  *
 598  * Actually, we could lots of memory writes here. tp of listening
 599  * socket contains all necessary default parameters.
 600  */
 601 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 602 {
 603         struct sock *newsk = sk_clone(sk, GFP_ATOMIC);
 604
 605         if (newsk != NULL) {
 606                 struct inet_request_sock *ireq = inet_rsk(req);
 607                 struct tcp_request_sock *treq = tcp_rsk(req);
 608                 struct inet_sock *newinet = inet_sk(newsk);
 609                 struct inet_connection_sock *newicsk = inet_csk(newsk);
 610                 struct tcp_sock *newtp;
 611
 612                 newsk->sk_state = TCP_SYN_RECV;
 613                 newicsk->icsk_bind_hash = NULL;
 614
 615                 /* Clone the TCP header template */
 616                 newinet->dport = ireq->rmt_port;
 617                 newsk->sk_write_space = sk_stream_write_space;
 618
 619                 /* Now setup tcp_sock */
 620                 newtp = tcp_sk(newsk);
 621                 newtp->pred_flags = 0;
 622                 newtp->rcv_nxt = treq->rcv_isn + 1;
 623                 newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
 624
 625                 tcp_prequeue_init(newtp);
 626
 627                 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
 628
 629                 newicsk->icsk_retransmits = 0;
 630                 newicsk->icsk_backoff = 0;
 631                 newtp->srtt = 0;
 632                 newtp->mdev = TCP_TIMEOUT_INIT;
 633                 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 634
 635                 newtp->packets_out = 0;
 636                 newtp->left_out = 0;
 637                 newtp->retrans_out = 0;
 638                 newtp->sacked_out = 0;
 639                 newtp->fackets_out = 0;
 640                 newtp->snd_ssthresh = 0x7fffffff;
 641
 642                 /* So many TCP implementations out there (incorrectly) count the
 643                  * initial SYN frame in their delayed-ACK and congestion control
 644                  * algorithms that we must have the following bandaid to talk
 645                  * efficiently to them.  -DaveM
 646                  */
 647                 newtp->snd_cwnd = 2;
 648                 newtp->snd_cwnd_cnt = 0;
 649
 650                 newtp->frto_counter = 0;
 651                 newtp->frto_highmark = 0;
 652
 653                 newtp->ca_ops = &tcp_reno;
 654
 655                 tcp_set_ca_state(newtp, TCP_CA_Open);
 656                 tcp_init_xmit_timers(newsk);
 657                 skb_queue_head_init(&newtp->out_of_order_queue);
 658                 newtp->rcv_wup = treq->rcv_isn + 1;
 659                 newtp->write_seq = treq->snt_isn + 1;
 660                 newtp->pushed_seq = newtp->write_seq;
 661                 newtp->copied_seq = treq->rcv_isn + 1;
 662
 663                 newtp->rx_opt.saw_tstamp = 0;
 664
 665                 newtp->rx_opt.dsack = 0;
 666                 newtp->rx_opt.eff_sacks = 0;
 667
 668                 newtp->probes_out = 0;
 669                 newtp->rx_opt.num_sacks = 0;
 670                 newtp->urg_data = 0;
 671                 /* Deinitialize accept_queue to trap illegal accesses. */
 672                 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
 673
 674                 if (sock_flag(newsk, SOCK_KEEPOPEN))
 675                         inet_csk_reset_keepalive_timer(newsk,
 676                                                        keepalive_time_when(newtp));
 677
 678                 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
 679                 if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
 680                         if (sysctl_tcp_fack)
 681                                 newtp->rx_opt.sack_ok |= 2;
 682                 }
 683                 newtp->window_clamp = req->window_clamp;
 684                 newtp->rcv_ssthresh = req->rcv_wnd;
 685                 newtp->rcv_wnd = req->rcv_wnd;
 686                 newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 687                 if (newtp->rx_opt.wscale_ok) {
 688                         newtp->rx_opt.snd_wscale = ireq->snd_wscale;
 689                         newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
 690                 } else {
 691                         newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
 692                         newtp->window_clamp = min(newtp->window_clamp, 65535U);
 693                 }
 694                 newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
 695                 newtp->max_window = newtp->snd_wnd;
 696
 697                 if (newtp->rx_opt.tstamp_ok) {
 698                         newtp->rx_opt.ts_recent = req->ts_recent;
 699                         newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
 700                         newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 701                 } else {
 702                         newtp->rx_opt.ts_recent_stamp = 0;
 703                         newtp->tcp_header_len = sizeof(struct tcphdr);
 704                 }
 705                 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
 706                         newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 707                 newtp->rx_opt.mss_clamp = req->mss;
 708                 TCP_ECN_openreq_child(newtp, req);
 709                 if (newtp->ecn_flags&TCP_ECN_OK)
 710                         sock_set_flag(newsk, SOCK_NO_LARGESEND);
 711
 712                 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
 713         }
 714         return newsk;
 715 }
 716
 717 /*
 718  *      Process an incoming packet for SYN_RECV sockets represented as a request_sock.
 719  *      Returns the new child socket, sk itself on an invalid ACK, or NULL when answered/dropped here.
 720  */
 721
 722 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,        /* sk: listening socket; skb: received segment */
 723                            struct request_sock *req,                   /* req: the pending connection request being checked */
 724                            struct request_sock **prev)                 /* prev: only forwarded to the queue unlink/drop helpers */
 725 {
 726         struct tcphdr *th = skb->h.th;
 727         struct tcp_sock *tp = tcp_sk(sk);
 728         u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);        /* keep only the RST, SYN and ACK bits */
 729         int paws_reject = 0;
 730         struct tcp_options_received tmp_opt;
 731         struct sock *child;
 732
 733         tmp_opt.saw_tstamp = 0;
 734         if (th->doff > (sizeof(struct tcphdr)>>2)) {   /* header longer than a bare tcphdr (doff vs. size in 32-bit words) */
 735                 tcp_parse_options(skb, &tmp_opt, 0);
 736
 737                 if (tmp_opt.saw_tstamp) {
 738                         tmp_opt.ts_recent = req->ts_recent;
 739                         /* We do not store the true stamp, but it is not required,
 740                          * it can be estimated (approximately)
 741                          * from other data.
 742                          */
 743                         tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
 744                         paws_reject = tcp_paws_check(&tmp_opt, th->rst);
 745                 }
 746         }
 747
 748         /* Check for pure retransmitted SYN. */
 749         if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 750             flg == TCP_FLAG_SYN &&
 751             !paws_reject) {
 752                 /*
 753                  * RFC793 draws (Incorrectly! It was fixed in RFC1122)
 754                  * this case on figure 6 and figure 8, but formal
 755                  * protocol description says NOTHING.
 756                  * To be more exact, it says that we should send ACK,
 757                  * because this segment (at least, if it has no data)
 758                  * is out of window.
 759                  *
 760                  *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
 761                  *  describe SYN-RECV state. All the description
 762                  *  is wrong, we cannot trust it and should
 763                  *  rely only on common sense and implementation
 764                  *  experience.
 765                  *
 766                  * Enforce "SYN-ACK" according to figure 8, figure 6
 767                  * of RFC793, fixed by RFC1122.
 768                  */
 769                 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
 770                 return NULL;
 771         }
 772
 773         /* Further reproduces section "SEGMENT ARRIVES"
 774            for state SYN-RECEIVED of RFC793.
 775            It is broken, however, it does not work only
 776            when SYNs are crossed.
 777
 778            You would think that SYN crossing is impossible here, since
 779            we should have a SYN_SENT socket (from connect()) on our end,
 780            but this is not true if the crossed SYNs were sent to both
 781            ends by a malicious third party.  We must defend against this,
 782            and to do that we first verify the ACK (as per RFC793, page
 783            36) and reset if it is invalid.  Is this a true full defense?
 784            To convince ourselves, let us consider a way in which the ACK
 785            test can still pass in this 'malicious crossed SYNs' case.
 786            Malicious sender sends identical SYNs (and thus identical sequence
 787            numbers) to both A and B:
 788
 789                 A: gets SYN, seq=7
 790                 B: gets SYN, seq=7
 791
 792            By our good fortune, both A and B select the same initial
 793            send sequence number of seven :-)
 794
 795                 A: sends SYN|ACK, seq=7, ack_seq=8
 796                 B: sends SYN|ACK, seq=7, ack_seq=8
 797
 798            So we are now A eating this SYN|ACK, ACK test passes.  So
 799            does sequence test, SYN is truncated, and thus we consider
 800            it a bare ACK.
 801
 802            If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
 803            we create an established connection.  Both ends (listening sockets)
 804            accept the new incoming connection and try to talk to each other. 8-)
 805
 806            Note: This case is both harmless, and rare.  Possibility is about the
 807            same as us discovering intelligent life on another planet tomorrow.
 808
 809            But generally, we should (RFC lies!) accept ACK
 810            from SYNACK both here and in tcp_rcv_state_process().
 811            tcp_rcv_state_process() does not, hence, neither do we.
 812
 813            Note that the case is absolutely generic:
 814            we cannot optimize anything here without
 815            violating protocol. All the checks must be made
 816            before attempt to create socket.
 817          */
 818
 819         /* RFC793 page 36: "If the connection is in any non-synchronized state ...
 820          *                  and the incoming segment acknowledges something not yet
 821          *                  sent (the segment carries an unacceptable ACK) ...
 822          *                  a reset is sent."
 823          *
 824          * Invalid ACK: reset will be sent by listening socket
 825          */
 826         if ((flg & TCP_FLAG_ACK) &&
 827             (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
 828                 return sk;     /* caller gets the listener back instead of a child or NULL */
 829
 830         /* Also, it would be not so bad idea to check rcv_tsecr, which
 831          * is essentially ACK extension and too early or too late values
 832          * should cause reset in unsynchronized states.
 833          */
 834
 835         /* RFC793: "first check sequence number". */
 836
 837         if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 838                                           tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
 839                 /* Out of window (or PAWS failure): send ACK, unless the segment is a RST, and drop. */
 840                 if (!(flg & TCP_FLAG_RST))
 841                         req->rsk_ops->send_ack(skb, req);
 842                 if (paws_reject)
 843                         NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 844                 return NULL;
 845         }
 846
 847         /* In sequence, PAWS is OK.  NB: only the ts_recent update is conditional; the deeper indentation after it is cosmetic. */
 848
 849         if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
 850                         req->ts_recent = tmp_opt.rcv_tsval;    /* the sole statement guarded by the if above */
 851
 852                 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
 853                         /* Truncate SYN, it is out of window starting
 854                            at tcp_rsk(req)->rcv_isn + 1. */
 855                         flg &= ~TCP_FLAG_SYN;
 856                 }
 857
 858                 /* RFC793: "second check the RST bit" and
 859                  *         "fourth, check the SYN bit"
 860                  */
 861                 if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
 862                         goto embryonic_reset;
 863
 864                 /* ACK sequence verified above, just make sure ACK is
 865                  * set.  If ACK not set, just silently drop the packet.
 866                  */
 867                 if (!(flg & TCP_FLAG_ACK))
 868                         return NULL;
 869
 870                 /* If TCP_DEFER_ACCEPT is set, drop bare ACK (end_seq == rcv_isn + 1) but mark the request as acked. */
 871                 if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 872                         inet_rsk(req)->acked = 1;
 873                         return NULL;
 874                 }
 875
 876                 /* OK, ACK is valid, create big socket and
 877                  * feed this segment to it. It will repeat all
 878                  * the tests. THIS SEGMENT MUST MOVE SOCKET TO
 879                  * ESTABLISHED STATE. If it will be dropped after
 880                  * socket is created, wait for troubles.
 881                  */
 882                 child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
 883                 if (child == NULL)
 884                         goto listen_overflow;
 885
 886                 inet_csk_reqsk_queue_unlink(sk, req, prev);
 887                 inet_csk_reqsk_queue_removed(sk, req);
 888
 889                 inet_csk_reqsk_queue_add(sk, req, child);
 890                 return child;
 891
 892         listen_overflow:       /* reached only when syn_recv_sock() returned NULL */
 893                 if (!sysctl_tcp_abort_on_overflow) {
 894                         inet_rsk(req)->acked = 1;
 895                         return NULL;
 896                 }
 897
 898         embryonic_reset:       /* reached on RST/SYN above, or by fall-through when sysctl_tcp_abort_on_overflow is set */
 899                 NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
 900                 if (!(flg & TCP_FLAG_RST))     /* do not answer a RST with a reset */
 901                         req->rsk_ops->send_reset(skb);
 902
 903                 inet_csk_reqsk_queue_drop(sk, req, prev);
 904                 return NULL;
 905 }
 906
 907 /*
 908  * Hand a segment to a freshly created child socket.  When a user owns
 909  * the child, the segment is put on the child's backlog; otherwise it is
 910  * run through tcp_rcv_state_process() here and that result is returned.
 911  */
 912
 913 int tcp_child_process(struct sock *parent, struct sock *child,
 914                       struct sk_buff *skb)
 915 {
 916         const int old_state = child->sk_state;
 917         int rc = 0;
 918
 919         if (sock_owned_by_user(child)) {
 920                 /* The child was found through the main socket hash
 921                  * table, so the lock on the listening socket does not
 922                  * protect it any more: defer the segment.
 923                  */
 924                 sk_add_backlog(child, skb);
 925         } else {
 926                 rc = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
 927
 928                 /* Child left SYN_RECV: wake up parent, send SIGIO */
 929                 if (old_state == TCP_SYN_RECV && old_state != child->sk_state)
 930                         parent->sk_data_ready(parent, 0);
 931         }
 932
 933         bh_unlock_sock(child);
 934         sock_put(child);
 935         return rc;
 936 }
 937
 938 EXPORT_SYMBOL(tcp_check_req);      /* symbols below are exported for use outside this file */
 939 EXPORT_SYMBOL(tcp_child_process);
 940 EXPORT_SYMBOL(tcp_create_openreq_child);
 941 EXPORT_SYMBOL(tcp_timewait_state_process);
 942 EXPORT_SYMBOL(tcp_tw_deschedule);