/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, unsigned int tcplen);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's: only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used
	   as state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
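
#if 0
	/* Illustration only, not compiled: the reuse rule above places the
	 * new incarnation's first sequence number just past the old one's
	 * send window, so the two incarnations cannot overlap in sequence
	 * space even without PAWS. A minimal sketch of the arithmetic:
	 */
	u32 isn = tcptw->tw_snd_nxt + 65535 + 2;	/* skip the old window */
	if (isn == 0)		/* zero would look like "no ISN chosen yet" */
		isn = 1;
#endif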
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
		 * when trying a new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	return err;
}
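
#if 0
	/* Userspace sketch (illustration only, not part of this file;
	 * assumes the standard BSD sockets API and the documentation
	 * address 192.0.2.1): a blocking connect() on an AF_INET stream
	 * socket is what ultimately reaches tcp_v4_connect() above.
	 */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
	};
	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
#endif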
/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always <576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to be wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
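
#if 0
	/* Rough relation applied by tcp_sync_mss() above (illustration only;
	 * the real computation also accounts for icsk_ext_hdr_len and any
	 * TCP option space):
	 */
	int new_mss = mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
#endif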
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
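
#if 0
	/* Example of the err > 0 encoding described above (illustration
	 * only): an ICMP "destination unreachable / port unreachable"
	 * arrives as
	 */
	err = (ICMP_DEST_UNREACH << 8) | ICMP_PORT_UNREACH;
	/* ...which icmp_err_convert[ICMP_PORT_UNREACH].errno maps to
	 * ECONNREFUSED further down.
	 */
#endif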
void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can, f.e., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;
			sk->sk_error_report(sk);
			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages have
	 * lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(len, inet->saddr,
					  inet->daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
	skb->ip_summed = CHECKSUM_PARTIAL;
	return 0;
}
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it is not for a socket
 *		existing in our system; if it matched a socket, it is
 *		just a duplicate segment or a bug in the other side's TCP.
 *		So we build the reply based only on the parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb->rtable->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key)
{
	struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, arg.iov[0].iov_len);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.bound_dev_if = oif;

	ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw)
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
				struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = tcp_hdr(skb);

		th->check = tcp_v4_check(skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	return __tcp_v4_send_synack(sk, req, NULL);
}
/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(tcp_hdr(skb)->dest));
	}
}
#endif
/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
					      struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
/* Find the Key structure for an address. */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}
/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		}
		if (tcp_alloc_md5sig_pool() == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}
		md5sig = tp->md5sig_info;

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
				 newkey, newkeylen);
}
int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
			} else if (tp->md5sig_info->entries4 != i) {
				/* Need to do some manipulation */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			tcp_free_md5sig_pool();
			return 0;
		}
	}
	return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);
static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the set of keys,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -ENOMEM;

		tp->md5sig_info = p;
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}
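
#if 0
	/* Userspace sketch (illustration only, not part of this file;
	 * assumes <netinet/tcp.h> exposing TCP_MD5SIG and struct
	 * tcp_md5sig): the setsockopt() call that is parsed by
	 * tcp_v4_parse_md5_keys() above.
	 */
	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
	memcpy(md5.tcpm_key, "secret", 6);
	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
#endif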
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, unsigned int tcplen)
{
	struct tcp_md5sig_pool *hp;
	struct tcp4_pseudohdr *bp;
	int err;

	/*
	 * Okay, so RFC2385 is turned on for this connection,
	 * so we need to generate the MD5 hash for the packet now.
	 */

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;

	bp = &hp->md5_blk.ip4;

	/*
	 * The TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = htons(tcplen);

	err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
				th, tcplen, hp);
	if (err)
		goto clear_hash;

	/* Free up the crypto pool */
	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
			 struct sock *sk,
			 struct dst_entry *dst,
			 struct request_sock *req,
			 struct tcphdr *th, unsigned int tcplen)
{
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->saddr;
		daddr = inet_sk(sk)->daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		struct rtable *rt = (struct rtable *)dst;
		BUG_ON(!rt);
		saddr = rt->rt_src;
		daddr = rt->rt_dst;
	}

	return tcp_v4_do_calc_md5_hash(md5_hash, key,
				       saddr, daddr,
				       th, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	__u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	if (!hash_expected && hash_location) {
		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest));
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_do_calc_md5_hash(newhash,
					  hash_expected,
					  iph->saddr, iph->daddr,
					  th, skb->len);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for "
			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
			       NIPQUAD(iph->saddr), ntohs(th->source),
			       NIPQUAD(iph->daddr), ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast */
	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations: they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better than
	 * clogging the syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers,
		 * which contain information interesting only for windows'
		 * users) do not send their stamp in SYN. It is an easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
		req->cookie_ts = tmp_opt.tstamp_ok;
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies, the last quarter of the
			 * backlog is filled with destinations
			 * proven to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered
			 * at the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from " NIPQUAD_FMT "/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (newinet->opt)
		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
					  newkey, key->keylen);
	}
#endif

	__inet_hash_nolisten(newsk);
	__inet_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}
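
#if 0
	/* For reference (illustration only): the value tested by
	 * tcp_v4_check() above folds the standard TCP pseudo-header sum
	 * with the sum over the TCP header and payload; a good packet
	 * yields a 16-bit one's-complement total of zero:
	 */
	__wsum pseudo = csum_tcpudp_nofold(iph->saddr, iph->daddr,
					   skb->len, IPPROTO_TCP, 0);
	int good = csum_fold(csum_partial(tcp_hdr(skb), skb->len,
					  pseudo)) == 0;
#endif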
/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		TCP_CHECK_TIMER(sk);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		TCP_CHECK_TIMER(sk);
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	}

	TCP_CHECK_TIMER(sk);
	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	TCP_CHECK_TIMER(sk);
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(TCP_MIB_INERRS);
	goto discard;
}
/*
 *	From tcp_input.c
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);
	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
/* VJ's idea. Save the last timestamp seen from this destination
 * and hold it at least for the normal timewait interval, to use for
 * duplicate segment detection in subsequent connections before they
 * enter synchronized state.
 */
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tcptw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}
struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.remember_stamp	   = tcp_v4_remember_stamp,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_calc_md5_hash,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
				   list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family &&
				    net_eq(sock_net(req->sk), net)) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct inet_timewait_sock *tw;
		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		read_lock_bh(lock);
		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		read_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	inet_listen_lock(&tcp_hashinfo);
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		inet_listen_unlock(&tcp_hashinfo);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			inet_listen_unlock(&tcp_hashinfo);
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}
static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			inet_listen_unlock(&tcp_hashinfo);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}
static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family = afinfo->family;
	return 0;
}
int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_fops.open		= tcp_seq_open;
	afinfo->seq_fops.read		= seq_read;
	afinfo->seq_fops.llseek		= seq_lseek;
	afinfo->seq_fops.release	= seq_release_net;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     &afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
static void get_openreq4(struct sock *sk, struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		   " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
		   i,
		   ireq->loc_addr,
		   ntohs(inet_sk(sk)->sport),
		   ireq->rmt_addr,
		   ntohs(ireq->rmt_port),
		   TCP_SYN_RECV,
		   0, 0, /* could print option size, but that is af dependent. */
		   1,    /* timers active (only the expire timer) */
		   jiffies_to_clock_t(ttd),
		   req->retrans,
		   uid,
		   0,  /* non standard timer */
		   0, /* open_requests have no inode */
		   atomic_read(&sk->sk_refcnt),
		   req,
		   len);
}
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->daddr;
	__be32 src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
					     (tp->rcv_nxt - tp->copied_seq),
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		icsk->icsk_rto,
		icsk->icsk_ack.ato,
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
		len);
}
static void get_timewait4_sock(struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		   " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
		   i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		   3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		   atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
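
/*
 * Example of one /proc/net/tcp row emitted above (illustration only;
 * the inode and pointer values are made up). A socket listening on
 * 127.0.0.1:22 (state 0A = TCP_LISTEN) would appear roughly as:
 *
 *  0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 c0ffee00 300 0 0 2 -1
 */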
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= {
		.owner		= THIS_MODULE,
	},
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};
static int tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};
2348 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2349 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2352 static void __net_exit tcp_sk_exit(struct net *net)
2354 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2357 static struct pernet_operations __net_initdata tcp_sk_ops = {
2358 .init = tcp_sk_init,
2359 .exit = tcp_sk_exit,
2362 void __init tcp_v4_init(void)
2364 if (register_pernet_device(&tcp_sk_ops))
2365 panic("Failed to create the TCP control socket.\n");
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);