[pandora-kernel.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
91                                    __be32 saddr, __be32 daddr,
92                                    struct tcphdr *th, unsigned int tcplen);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
101 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
102         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
103         .lhash_users = ATOMIC_INIT(0),
104         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
105 };
106
107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
108 {
109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
110                                           ip_hdr(skb)->saddr,
111                                           tcp_hdr(skb)->dest,
112                                           tcp_hdr(skb)->source);
113 }
114
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118         struct tcp_sock *tp = tcp_sk(sk);
119
120         /* With PAWS, it is safe from the viewpoint
121            of data integrity. Even without PAWS it is safe provided sequence
122            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
123
124            Actually, the idea is close to VJ's: only the timestamp cache is
125            held per port pair rather than per host, and the TW bucket is used
126            as the state holder.
127
128            If the TW bucket has already been destroyed we fall back to VJ's
129            scheme and use the initial timestamp retrieved from the peer table.
130          */
131         if (tcptw->tw_ts_recent_stamp &&
132             (twp == NULL || (sysctl_tcp_tw_reuse &&
133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
135                 if (tp->write_seq == 0)
136                         tp->write_seq = 1;
137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139                 sock_hold(sktw);
140                 return 1;
141         }
142
143         return 0;
144 }
145
146 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
147
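/*
 * Editorial note, a minimal userspace sketch (not part of this file): the
 * TIME-WAIT reuse path in tcp_twsk_unique() above is gated by the
 * sysctl_tcp_tw_reuse knob, exposed as /proc/sys/net/ipv4/tcp_tw_reuse.
 * The helper name below is hypothetical; it only shows the knob being
 * enabled from user space.
 */
#include <stdio.h>

static int enable_tcp_tw_reuse(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (!f)
                return -1;
        /* "1" allows reusing TIME-WAIT sockets for new outgoing connections */
        fputs("1\n", f);
        return fclose(f);
}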
148 /* This will initiate an outgoing connection. */
149 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
150 {
151         struct inet_sock *inet = inet_sk(sk);
152         struct tcp_sock *tp = tcp_sk(sk);
153         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
154         struct rtable *rt;
155         __be32 daddr, nexthop;
156         int tmp;
157         int err;
158
159         if (addr_len < sizeof(struct sockaddr_in))
160                 return -EINVAL;
161
162         if (usin->sin_family != AF_INET)
163                 return -EAFNOSUPPORT;
164
165         nexthop = daddr = usin->sin_addr.s_addr;
166         if (inet->opt && inet->opt->srr) {
167                 if (!daddr)
168                         return -EINVAL;
169                 nexthop = inet->opt->faddr;
170         }
171
172         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
173                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174                                IPPROTO_TCP,
175                                inet->sport, usin->sin_port, sk, 1);
176         if (tmp < 0) {
177                 if (tmp == -ENETUNREACH)
178                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return tmp;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet->opt || !inet->opt->srr)
188                 daddr = rt->rt_dst;
189
190         if (!inet->saddr)
191                 inet->saddr = rt->rt_src;
192         inet->rcv_saddr = inet->saddr;
193
194         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 tp->write_seq              = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
203                 struct inet_peer *peer = rt_get_peer(rt);
204                 /*
205                  * VJ's idea. We save the last timestamp seen from
206                  * the destination in the peer table when entering state
207                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
208                  * when trying a new connection.
209                  */
210                 if (peer != NULL &&
211                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
212                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
213                         tp->rx_opt.ts_recent = peer->tcp_ts;
214                 }
215         }
216
217         inet->dport = usin->sin_port;
218         inet->daddr = daddr;
219
220         inet_csk(sk)->icsk_ext_hdr_len = 0;
221         if (inet->opt)
222                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
223
224         tp->rx_opt.mss_clamp = 536;
225
226         /* Socket identity is still unknown (sport may be zero).
227          * However we set the state to SYN-SENT and, without releasing the
228          * socket lock, select a source port, enter ourselves into the hash
229          * tables and complete initialization after this.
230          */
231         tcp_set_state(sk, TCP_SYN_SENT);
232         err = inet_hash_connect(&tcp_death_row, sk);
233         if (err)
234                 goto failure;
235
236         err = ip_route_newports(&rt, IPPROTO_TCP,
237                                 inet->sport, inet->dport, sk);
238         if (err)
239                 goto failure;
240
241         /* OK, now commit destination to socket.  */
242         sk->sk_gso_type = SKB_GSO_TCPV4;
243         sk_setup_caps(sk, &rt->u.dst);
244
245         if (!tp->write_seq)
246                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
247                                                            inet->daddr,
248                                                            inet->sport,
249                                                            usin->sin_port);
250
251         inet->id = tp->write_seq ^ jiffies;
252
253         err = tcp_connect(sk);
254         rt = NULL;
255         if (err)
256                 goto failure;
257
258         return 0;
259
260 failure:
261         /*
262          * This unhashes the socket and releases the local port,
263          * if necessary.
264          */
265         tcp_set_state(sk, TCP_CLOSE);
266         ip_rt_put(rt);
267         sk->sk_route_caps = 0;
268         inet->dport = 0;
269         return err;
270 }
271
272 /*
273  * This routine does path mtu discovery as defined in RFC1191.
274  */
275 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
276 {
277         struct dst_entry *dst;
278         struct inet_sock *inet = inet_sk(sk);
279
280         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
281          * sent out by Linux are always < 576 bytes, so they should go through
282          * unfragmented).
283          */
284         if (sk->sk_state == TCP_LISTEN)
285                 return;
286
287         /* We don't check in the dst entry if pmtu discovery is forbidden
288          * on this route. We just assume that no packet-too-big packets
289          * are sent back when pmtu discovery is not active.
290          * There is a small race when the user changes this flag in the
291          * route, but I think that's acceptable.
292          */
293         if ((dst = __sk_dst_check(sk, 0)) == NULL)
294                 return;
295
296         dst->ops->update_pmtu(dst, mtu);
297
298         /* Something is about to go wrong... Remember the soft error
299          * in case this connection is not able to recover.
300          */
301         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
302                 sk->sk_err_soft = EMSGSIZE;
303
304         mtu = dst_mtu(dst);
305
306         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
307             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
308                 tcp_sync_mss(sk, mtu);
309
310                 /* Resend the TCP packet because it's
311                  * clear that the old packet has been
312                  * dropped. This is the new "fast" path mtu
313                  * discovery.
314                  */
315                 tcp_simple_retransmit(sk);
316         } /* else let the usual retransmit timer handle it */
317 }
318
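/*
 * Editorial note, a minimal userspace sketch (not part of this file): the
 * inet->pmtudisc check in do_pmtu_discovery() above reflects the per-socket
 * IP_MTU_DISCOVER setting, which an application controls via setsockopt().
 * The helper name is hypothetical; error handling is elided.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int disable_pmtu_discovery(int fd)
{
        int val = IP_PMTUDISC_DONT;     /* never set DF; ignore path MTU */

        return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}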
319 /*
320  * This routine is called by the ICMP module when it gets some
321  * sort of error condition.  If err < 0 then the socket should
322  * be closed and the error returned to the user.  If err > 0
323  * it's just the icmp type << 8 | icmp code.  After adjustment
324  * header points to the first 8 bytes of the tcp header.  We need
325  * to find the appropriate port.
326  *
327  * The locking strategy used here is very "optimistic". When
328  * someone else accesses the socket the ICMP is just dropped
329  * and for some paths there is no check at all.
330  * A more general error queue to queue errors for later handling
331  * is probably better.
332  *
333  */
334
335 void tcp_v4_err(struct sk_buff *skb, u32 info)
336 {
337         struct iphdr *iph = (struct iphdr *)skb->data;
338         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
339         struct tcp_sock *tp;
340         struct inet_sock *inet;
341         const int type = icmp_hdr(skb)->type;
342         const int code = icmp_hdr(skb)->code;
343         struct sock *sk;
344         __u32 seq;
345         int err;
346         struct net *net = dev_net(skb->dev);
347
348         if (skb->len < (iph->ihl << 2) + 8) {
349                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
350                 return;
351         }
352
353         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
354                         iph->saddr, th->source, inet_iif(skb));
355         if (!sk) {
356                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
357                 return;
358         }
359         if (sk->sk_state == TCP_TIME_WAIT) {
360                 inet_twsk_put(inet_twsk(sk));
361                 return;
362         }
363
364         bh_lock_sock(sk);
365         /* If too many ICMPs get dropped on busy
366          * servers this needs to be solved differently.
367          */
368         if (sock_owned_by_user(sk))
369                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
370
371         if (sk->sk_state == TCP_CLOSE)
372                 goto out;
373
374         tp = tcp_sk(sk);
375         seq = ntohl(th->seq);
376         if (sk->sk_state != TCP_LISTEN &&
377             !between(seq, tp->snd_una, tp->snd_nxt)) {
378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379                 goto out;
380         }
381
382         switch (type) {
383         case ICMP_SOURCE_QUENCH:
384                 /* Just silently ignore these. */
385                 goto out;
386         case ICMP_PARAMETERPROB:
387                 err = EPROTO;
388                 break;
389         case ICMP_DEST_UNREACH:
390                 if (code > NR_ICMP_UNREACH)
391                         goto out;
392
393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394                         if (!sock_owned_by_user(sk))
395                                 do_pmtu_discovery(sk, iph, info);
396                         goto out;
397                 }
398
399                 err = icmp_err_convert[code].errno;
400                 break;
401         case ICMP_TIME_EXCEEDED:
402                 err = EHOSTUNREACH;
403                 break;
404         default:
405                 goto out;
406         }
407
408         switch (sk->sk_state) {
409                 struct request_sock *req, **prev;
410         case TCP_LISTEN:
411                 if (sock_owned_by_user(sk))
412                         goto out;
413
414                 req = inet_csk_search_req(sk, &prev, th->dest,
415                                           iph->daddr, iph->saddr);
416                 if (!req)
417                         goto out;
418
419                 /* ICMPs are not backlogged, hence we cannot get
420                    an established socket here.
421                  */
422                 BUG_TRAP(!req->sk);
423
424                 if (seq != tcp_rsk(req)->snt_isn) {
425                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
426                         goto out;
427                 }
428
429                 /*
430                  * Still in SYN_RECV, just remove it silently.
431                  * There is no good way to pass the error to the newly
432                  * created socket, and POSIX does not want network
433                  * errors returned from accept().
434                  */
435                 inet_csk_reqsk_queue_drop(sk, req, prev);
436                 goto out;
437
438         case TCP_SYN_SENT:
439         case TCP_SYN_RECV:  /* Cannot normally happen.
440                                It can, e.g., if SYNs crossed.
441                              */
442                 if (!sock_owned_by_user(sk)) {
443                         sk->sk_err = err;
444
445                         sk->sk_error_report(sk);
446
447                         tcp_done(sk);
448                 } else {
449                         sk->sk_err_soft = err;
450                 }
451                 goto out;
452         }
453
454         /* If we've already connected we will keep trying
455          * until we time out, or the user gives up.
456          *
457          * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
458          * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
459          * but it is obsoleted by pmtu discovery).
460          *
461          * Note that in the modern internet, where routing is unreliable
462          * and broken firewalls sit in every dark corner, sending random
463          * errors ordered by their masters, even these two messages finally
464          * lose their original sense (even Linux sends invalid PORT_UNREACHs)
465          *
466          * Now we are in compliance with RFCs.
467          *                                                      --ANK (980905)
468          */
469
470         inet = inet_sk(sk);
471         if (!sock_owned_by_user(sk) && inet->recverr) {
472                 sk->sk_err = err;
473                 sk->sk_error_report(sk);
474         } else  { /* Only an error on timeout */
475                 sk->sk_err_soft = err;
476         }
477
478 out:
479         bh_unlock_sock(sk);
480         sock_put(sk);
481 }
482
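/*
 * Editorial note, a minimal userspace sketch (not part of this file): the
 * inet->recverr test in tcp_v4_err() above is controlled by the IP_RECVERR
 * socket option; with it enabled, ICMP-derived errors are reported to the
 * application instead of only setting a soft error. The helper name is
 * hypothetical.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int enable_ip_recverr(int fd)
{
        int on = 1;

        return setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
}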
483 /* This routine computes an IPv4 TCP checksum. */
484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
485 {
486         struct inet_sock *inet = inet_sk(sk);
487         struct tcphdr *th = tcp_hdr(skb);
488
489         if (skb->ip_summed == CHECKSUM_PARTIAL) {
490                 th->check = ~tcp_v4_check(len, inet->saddr,
491                                           inet->daddr, 0);
492                 skb->csum_start = skb_transport_header(skb) - skb->head;
493                 skb->csum_offset = offsetof(struct tcphdr, check);
494         } else {
495                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
496                                          csum_partial((char *)th,
497                                                       th->doff << 2,
498                                                       skb->csum));
499         }
500 }
501
502 int tcp_v4_gso_send_check(struct sk_buff *skb)
503 {
504         const struct iphdr *iph;
505         struct tcphdr *th;
506
507         if (!pskb_may_pull(skb, sizeof(*th)))
508                 return -EINVAL;
509
510         iph = ip_hdr(skb);
511         th = tcp_hdr(skb);
512
513         th->check = 0;
514         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
515         skb->csum_start = skb_transport_header(skb) - skb->head;
516         skb->csum_offset = offsetof(struct tcphdr, check);
517         skb->ip_summed = CHECKSUM_PARTIAL;
518         return 0;
519 }
520
521 /*
522  *      This routine will send an RST to the other tcp.
523  *
524  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
525  *                    for reset.
526  *      Answer: if a packet caused an RST, it is not for a socket
527  *              existing in our system; if it is matched to a socket,
528  *              it is just a duplicate segment or a bug in the other side's TCP.
529  *              So we build the reply based only on the parameters
530  *              that arrived with the segment.
531  *      Exception: precedence violation. We do not implement it in any case.
532  */
533
534 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
535 {
536         struct tcphdr *th = tcp_hdr(skb);
537         struct {
538                 struct tcphdr th;
539 #ifdef CONFIG_TCP_MD5SIG
540                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
541 #endif
542         } rep;
543         struct ip_reply_arg arg;
544 #ifdef CONFIG_TCP_MD5SIG
545         struct tcp_md5sig_key *key;
546 #endif
547         struct net *net;
548
549         /* Never send a reset in response to a reset. */
550         if (th->rst)
551                 return;
552
553         if (skb->rtable->rt_type != RTN_LOCAL)
554                 return;
555
556         /* Swap the send and the receive. */
557         memset(&rep, 0, sizeof(rep));
558         rep.th.dest   = th->source;
559         rep.th.source = th->dest;
560         rep.th.doff   = sizeof(struct tcphdr) / 4;
561         rep.th.rst    = 1;
562
563         if (th->ack) {
564                 rep.th.seq = th->ack_seq;
565         } else {
566                 rep.th.ack = 1;
567                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
568                                        skb->len - (th->doff << 2));
569         }
570
571         memset(&arg, 0, sizeof(arg));
572         arg.iov[0].iov_base = (unsigned char *)&rep;
573         arg.iov[0].iov_len  = sizeof(rep.th);
574
575 #ifdef CONFIG_TCP_MD5SIG
576         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
577         if (key) {
578                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
579                                    (TCPOPT_NOP << 16) |
580                                    (TCPOPT_MD5SIG << 8) |
581                                    TCPOLEN_MD5SIG);
582                 /* Update length and the length the header thinks exists */
583                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
584                 rep.th.doff = arg.iov[0].iov_len / 4;
585
586                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
587                                         key,
588                                         ip_hdr(skb)->daddr,
589                                         ip_hdr(skb)->saddr,
590                                         &rep.th, arg.iov[0].iov_len);
591         }
592 #endif
593         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
594                                       ip_hdr(skb)->saddr, /* XXX */
595                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
596         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
597
598         net = dev_net(skb->dst->dev);
599         ip_send_reply(net->ipv4.tcp_sock, skb,
600                       &arg, arg.iov[0].iov_len);
601
602         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
603         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
604 }
605
606 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
607    outside of socket context, is certainly ugly. What can I do?
608  */
609
610 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
611                             u32 win, u32 ts, int oif,
612                             struct tcp_md5sig_key *key)
613 {
614         struct tcphdr *th = tcp_hdr(skb);
615         struct {
616                 struct tcphdr th;
617                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
618 #ifdef CONFIG_TCP_MD5SIG
619                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
620 #endif
621                         ];
622         } rep;
623         struct ip_reply_arg arg;
624         struct net *net = dev_net(skb->dev);
625
626         memset(&rep.th, 0, sizeof(struct tcphdr));
627         memset(&arg, 0, sizeof(arg));
628
629         arg.iov[0].iov_base = (unsigned char *)&rep;
630         arg.iov[0].iov_len  = sizeof(rep.th);
631         if (ts) {
632                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
633                                    (TCPOPT_TIMESTAMP << 8) |
634                                    TCPOLEN_TIMESTAMP);
635                 rep.opt[1] = htonl(tcp_time_stamp);
636                 rep.opt[2] = htonl(ts);
637                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
638         }
639
640         /* Swap the send and the receive. */
641         rep.th.dest    = th->source;
642         rep.th.source  = th->dest;
643         rep.th.doff    = arg.iov[0].iov_len / 4;
644         rep.th.seq     = htonl(seq);
645         rep.th.ack_seq = htonl(ack);
646         rep.th.ack     = 1;
647         rep.th.window  = htons(win);
648
649 #ifdef CONFIG_TCP_MD5SIG
650         if (key) {
651                 int offset = (ts) ? 3 : 0;
652
653                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
654                                           (TCPOPT_NOP << 16) |
655                                           (TCPOPT_MD5SIG << 8) |
656                                           TCPOLEN_MD5SIG);
657                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
658                 rep.th.doff = arg.iov[0].iov_len/4;
659
660                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
661                                         key,
662                                         ip_hdr(skb)->daddr,
663                                         ip_hdr(skb)->saddr,
664                                         &rep.th, arg.iov[0].iov_len);
665         }
666 #endif
667         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
668                                       ip_hdr(skb)->saddr, /* XXX */
669                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
670         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
671         if (oif)
672                 arg.bound_dev_if = oif;
673
674         ip_send_reply(net->ipv4.tcp_sock, skb,
675                       &arg, arg.iov[0].iov_len);
676
677         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
678 }
679
680 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
681 {
682         struct inet_timewait_sock *tw = inet_twsk(sk);
683         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
684
685         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
686                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
687                         tcptw->tw_ts_recent,
688                         tw->tw_bound_dev_if,
689                         tcp_twsk_md5_key(tcptw)
690                         );
691
692         inet_twsk_put(tw);
693 }
694
695 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
696                                   struct request_sock *req)
697 {
698         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
699                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
700                         req->ts_recent,
701                         0,
702                         tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
703 }
704
705 /*
706  *      Send a SYN-ACK after having received a SYN.
707  *      This still operates on a request_sock only, not on a big
708  *      socket.
709  */
710 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
711                                 struct dst_entry *dst)
712 {
713         const struct inet_request_sock *ireq = inet_rsk(req);
714         int err = -1;
715         struct sk_buff * skb;
716
717         /* First, grab a route. */
718         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
719                 return -1;
720
721         skb = tcp_make_synack(sk, dst, req);
722
723         if (skb) {
724                 struct tcphdr *th = tcp_hdr(skb);
725
726                 th->check = tcp_v4_check(skb->len,
727                                          ireq->loc_addr,
728                                          ireq->rmt_addr,
729                                          csum_partial((char *)th, skb->len,
730                                                       skb->csum));
731
732                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
733                                             ireq->rmt_addr,
734                                             ireq->opt);
735                 err = net_xmit_eval(err);
736         }
737
738         dst_release(dst);
739         return err;
740 }
741
742 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
743 {
744         return __tcp_v4_send_synack(sk, req, NULL);
745 }
746
747 /*
748  *      IPv4 request_sock destructor.
749  */
750 static void tcp_v4_reqsk_destructor(struct request_sock *req)
751 {
752         kfree(inet_rsk(req)->opt);
753 }
754
755 #ifdef CONFIG_SYN_COOKIES
756 static void syn_flood_warning(struct sk_buff *skb)
757 {
758         static unsigned long warntime;
759
760         if (time_after(jiffies, (warntime + HZ * 60))) {
761                 warntime = jiffies;
762                 printk(KERN_INFO
763                        "possible SYN flooding on port %d. Sending cookies.\n",
764                        ntohs(tcp_hdr(skb)->dest));
765         }
766 }
767 #endif
768
769 /*
770  * Save and compile IPv4 options into the request_sock if needed.
771  */
772 static struct ip_options *tcp_v4_save_options(struct sock *sk,
773                                               struct sk_buff *skb)
774 {
775         struct ip_options *opt = &(IPCB(skb)->opt);
776         struct ip_options *dopt = NULL;
777
778         if (opt && opt->optlen) {
779                 int opt_size = optlength(opt);
780                 dopt = kmalloc(opt_size, GFP_ATOMIC);
781                 if (dopt) {
782                         if (ip_options_echo(dopt, skb)) {
783                                 kfree(dopt);
784                                 dopt = NULL;
785                         }
786                 }
787         }
788         return dopt;
789 }
790
791 #ifdef CONFIG_TCP_MD5SIG
792 /*
793  * RFC2385 MD5 checksumming requires a mapping of
794  * IP address->MD5 Key.
795  * We need to maintain these in the sk structure.
796  */
797
798 /* Find the Key structure for an address.  */
799 static struct tcp_md5sig_key *
800                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
801 {
802         struct tcp_sock *tp = tcp_sk(sk);
803         int i;
804
805         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
806                 return NULL;
807         for (i = 0; i < tp->md5sig_info->entries4; i++) {
808                 if (tp->md5sig_info->keys4[i].addr == addr)
809                         return &tp->md5sig_info->keys4[i].base;
810         }
811         return NULL;
812 }
813
814 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
815                                          struct sock *addr_sk)
816 {
817         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
818 }
819
820 EXPORT_SYMBOL(tcp_v4_md5_lookup);
821
822 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
823                                                       struct request_sock *req)
824 {
825         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
826 }
827
828 /* This can be called on a newly created socket, from other files */
829 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
830                       u8 *newkey, u8 newkeylen)
831 {
832         /* Add Key to the list */
833         struct tcp_md5sig_key *key;
834         struct tcp_sock *tp = tcp_sk(sk);
835         struct tcp4_md5sig_key *keys;
836
837         key = tcp_v4_md5_do_lookup(sk, addr);
838         if (key) {
839                 /* Pre-existing entry - just update that one. */
840                 kfree(key->key);
841                 key->key = newkey;
842                 key->keylen = newkeylen;
843         } else {
844                 struct tcp_md5sig_info *md5sig;
845
846                 if (!tp->md5sig_info) {
847                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
848                                                   GFP_ATOMIC);
849                         if (!tp->md5sig_info) {
850                                 kfree(newkey);
851                                 return -ENOMEM;
852                         }
853                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
854                 }
855                 if (tcp_alloc_md5sig_pool() == NULL) {
856                         kfree(newkey);
857                         return -ENOMEM;
858                 }
859                 md5sig = tp->md5sig_info;
860
861                 if (md5sig->alloced4 == md5sig->entries4) {
862                         keys = kmalloc((sizeof(*keys) *
863                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
864                         if (!keys) {
865                                 kfree(newkey);
866                                 tcp_free_md5sig_pool();
867                                 return -ENOMEM;
868                         }
869
870                         if (md5sig->entries4)
871                                 memcpy(keys, md5sig->keys4,
872                                        sizeof(*keys) * md5sig->entries4);
873
874                         /* Free old key list, and reference new one */
875                         kfree(md5sig->keys4);
876                         md5sig->keys4 = keys;
877                         md5sig->alloced4++;
878                 }
879                 md5sig->entries4++;
880                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
881                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
882                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
883         }
884         return 0;
885 }
886
887 EXPORT_SYMBOL(tcp_v4_md5_do_add);
888
889 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
890                                u8 *newkey, u8 newkeylen)
891 {
892         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
893                                  newkey, newkeylen);
894 }
895
896 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
897 {
898         struct tcp_sock *tp = tcp_sk(sk);
899         int i;
900
901         for (i = 0; i < tp->md5sig_info->entries4; i++) {
902                 if (tp->md5sig_info->keys4[i].addr == addr) {
903                         /* Free the key */
904                         kfree(tp->md5sig_info->keys4[i].base.key);
905                         tp->md5sig_info->entries4--;
906
907                         if (tp->md5sig_info->entries4 == 0) {
908                                 kfree(tp->md5sig_info->keys4);
909                                 tp->md5sig_info->keys4 = NULL;
910                                 tp->md5sig_info->alloced4 = 0;
911                         } else if (tp->md5sig_info->entries4 != i) {
912                                 /* Shift the remaining keys down to fill the hole */
913                                 memmove(&tp->md5sig_info->keys4[i],
914                                         &tp->md5sig_info->keys4[i+1],
915                                         (tp->md5sig_info->entries4 - i) *
916                                          sizeof(struct tcp4_md5sig_key));
917                         }
918                         tcp_free_md5sig_pool();
919                         return 0;
920                 }
921         }
922         return -ENOENT;
923 }
924
925 EXPORT_SYMBOL(tcp_v4_md5_do_del);
926
927 static void tcp_v4_clear_md5_list(struct sock *sk)
928 {
929         struct tcp_sock *tp = tcp_sk(sk);
930
931         /* Free each key, then the set of keys,
932          * the crypto element, and then decrement our
933          * hold on the last resort crypto.
934          */
935         if (tp->md5sig_info->entries4) {
936                 int i;
937                 for (i = 0; i < tp->md5sig_info->entries4; i++)
938                         kfree(tp->md5sig_info->keys4[i].base.key);
939                 tp->md5sig_info->entries4 = 0;
940                 tcp_free_md5sig_pool();
941         }
942         if (tp->md5sig_info->keys4) {
943                 kfree(tp->md5sig_info->keys4);
944                 tp->md5sig_info->keys4 = NULL;
945                 tp->md5sig_info->alloced4  = 0;
946         }
947 }
948
949 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
950                                  int optlen)
951 {
952         struct tcp_md5sig cmd;
953         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
954         u8 *newkey;
955
956         if (optlen < sizeof(cmd))
957                 return -EINVAL;
958
959         if (copy_from_user(&cmd, optval, sizeof(cmd)))
960                 return -EFAULT;
961
962         if (sin->sin_family != AF_INET)
963                 return -EINVAL;
964
965         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
966                 if (!tcp_sk(sk)->md5sig_info)
967                         return -ENOENT;
968                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
969         }
970
971         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
972                 return -EINVAL;
973
974         if (!tcp_sk(sk)->md5sig_info) {
975                 struct tcp_sock *tp = tcp_sk(sk);
976                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
977
978                 if (!p)
979                         return -EINVAL;
980
981                 tp->md5sig_info = p;
982                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
983         }
984
985         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
986         if (!newkey)
987                 return -ENOMEM;
988         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
989                                  newkey, cmd.tcpm_keylen);
990 }
991
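/*
 * Editorial note, a minimal userspace sketch (not part of this file): the
 * option parsed by tcp_v4_parse_md5_keys() above is installed from user
 * space with the TCP_MD5SIG socket option and struct tcp_md5sig from
 * <linux/tcp.h>. The helper name is hypothetical; error handling is elided.
 */
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/tcp.h>

static int set_tcp_md5_key(int fd, const char *peer_ip, const char *secret)
{
        struct tcp_md5sig md5;
        struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;

        memset(&md5, 0, sizeof(md5));
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, peer_ip, &sin->sin_addr);
        /* keys are limited to TCP_MD5SIG_MAXKEYLEN bytes */
        md5.tcpm_keylen = strlen(secret);
        memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);

        /* Lands in tcp_v4_parse_md5_keys() -> tcp_v4_md5_do_add() */
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}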
992 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
993                                    __be32 saddr, __be32 daddr,
994                                    struct tcphdr *th,
995                                    unsigned int tcplen)
996 {
997         struct tcp_md5sig_pool *hp;
998         struct tcp4_pseudohdr *bp;
999         int err;
1000
1001         /*
1002          * Okay, so RFC2385 is turned on for this connection,
1003          * so we need to generate the MD5 hash for the packet now.
1004          */
1005
1006         hp = tcp_get_md5sig_pool();
1007         if (!hp)
1008                 goto clear_hash_noput;
1009
1010         bp = &hp->md5_blk.ip4;
1011
1012         /*
1013          * The TCP pseudo-header (in the order: source IP address,
1014          * destination IP address, zero-padded protocol number, and
1015          * segment length)
1016          */
1017         bp->saddr = saddr;
1018         bp->daddr = daddr;
1019         bp->pad = 0;
1020         bp->protocol = IPPROTO_TCP;
1021         bp->len = htons(tcplen);
1022
1023         err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1024                                 th, tcplen, hp);
1025         if (err)
1026                 goto clear_hash;
1027
1028         /* Free up the crypto pool */
1029         tcp_put_md5sig_pool();
1030 out:
1031         return 0;
1032 clear_hash:
1033         tcp_put_md5sig_pool();
1034 clear_hash_noput:
1035         memset(md5_hash, 0, 16);
1036         goto out;
1037 }
1038
1039 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1040                          struct sock *sk,
1041                          struct dst_entry *dst,
1042                          struct request_sock *req,
1043                          struct tcphdr *th,
1044                          unsigned int tcplen)
1045 {
1046         __be32 saddr, daddr;
1047
1048         if (sk) {
1049                 saddr = inet_sk(sk)->saddr;
1050                 daddr = inet_sk(sk)->daddr;
1051         } else {
1052                 struct rtable *rt = (struct rtable *)dst;
1053                 BUG_ON(!rt);
1054                 saddr = rt->rt_src;
1055                 daddr = rt->rt_dst;
1056         }
1057         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1058                                        saddr, daddr,
1059                                        th, tcplen);
1060 }
1061
1062 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1063
1064 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1065 {
1066         /*
1067          * This gets called for each TCP segment that arrives
1068          * so we want to be efficient.
1069          * We have 3 drop cases:
1070          * o No MD5 hash and one expected.
1071          * o MD5 hash and we're not expecting one.
1072          * o MD5 hash and it is wrong.
1073          */
1074         __u8 *hash_location = NULL;
1075         struct tcp_md5sig_key *hash_expected;
1076         const struct iphdr *iph = ip_hdr(skb);
1077         struct tcphdr *th = tcp_hdr(skb);
1078         int genhash;
1079         unsigned char newhash[16];
1080
1081         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1082         hash_location = tcp_parse_md5sig_option(th);
1083
1084         /* We've parsed the options - do we have a hash? */
1085         if (!hash_expected && !hash_location)
1086                 return 0;
1087
1088         if (hash_expected && !hash_location) {
1089                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1090                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1091                                NIPQUAD(iph->saddr), ntohs(th->source),
1092                                NIPQUAD(iph->daddr), ntohs(th->dest));
1093                 return 1;
1094         }
1095
1096         if (!hash_expected && hash_location) {
1097                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1098                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1099                                NIPQUAD(iph->saddr), ntohs(th->source),
1100                                NIPQUAD(iph->daddr), ntohs(th->dest));
1101                 return 1;
1102         }
1103
1104         /* Okay, so this is hash_expected and hash_location -
1105          * so we need to calculate the expected MD5 hash.
1106          */
1107         genhash = tcp_v4_do_calc_md5_hash(newhash,
1108                                           hash_expected,
1109                                           iph->saddr, iph->daddr,
1110                                           th, skb->len);
1111
1112         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1113                 if (net_ratelimit()) {
1114                         printk(KERN_INFO "MD5 Hash failed for "
1115                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1116                                NIPQUAD(iph->saddr), ntohs(th->source),
1117                                NIPQUAD(iph->daddr), ntohs(th->dest),
1118                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1119                 }
1120                 return 1;
1121         }
1122         return 0;
1123 }
1124
1125 #endif
1126
1127 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1128         .family         =       PF_INET,
1129         .obj_size       =       sizeof(struct tcp_request_sock),
1130         .rtx_syn_ack    =       tcp_v4_send_synack,
1131         .send_ack       =       tcp_v4_reqsk_send_ack,
1132         .destructor     =       tcp_v4_reqsk_destructor,
1133         .send_reset     =       tcp_v4_send_reset,
1134 };
1135
1136 #ifdef CONFIG_TCP_MD5SIG
1137 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1138         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1139 };
1140 #endif
1141
1142 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1143         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1144         .twsk_unique    = tcp_twsk_unique,
1145         .twsk_destructor= tcp_twsk_destructor,
1146 };
1147
1148 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1149 {
1150         struct inet_request_sock *ireq;
1151         struct tcp_options_received tmp_opt;
1152         struct request_sock *req;
1153         __be32 saddr = ip_hdr(skb)->saddr;
1154         __be32 daddr = ip_hdr(skb)->daddr;
1155         __u32 isn = TCP_SKB_CB(skb)->when;
1156         struct dst_entry *dst = NULL;
1157 #ifdef CONFIG_SYN_COOKIES
1158         int want_cookie = 0;
1159 #else
1160 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1161 #endif
1162
1163         /* Never answer SYNs sent to broadcast or multicast */
1164         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1165                 goto drop;
1166
1167         /* TW buckets are converted to open requests without
1168          * limitation: they conserve resources and the peer is
1169          * evidently a real one.
1170          */
1171         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1172 #ifdef CONFIG_SYN_COOKIES
1173                 if (sysctl_tcp_syncookies) {
1174                         want_cookie = 1;
1175                 } else
1176 #endif
1177                 goto drop;
1178         }
1179
1180         /* The accept backlog is full. If we have already queued enough
1181          * warm entries in the syn queue, drop the request. That is better
1182          * than clogging the syn queue with openreqs whose timeouts
1183          * increase exponentially.
1184          */
1185         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1186                 goto drop;
1187
1188         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1189         if (!req)
1190                 goto drop;
1191
1192 #ifdef CONFIG_TCP_MD5SIG
1193         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1194 #endif
1195
1196         tcp_clear_options(&tmp_opt);
1197         tmp_opt.mss_clamp = 536;
1198         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1199
1200         tcp_parse_options(skb, &tmp_opt, 0);
1201
1202         if (want_cookie && !tmp_opt.saw_tstamp)
1203                 tcp_clear_options(&tmp_opt);
1204
1205         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1206                 /* Some OSes (unknown ones, but I see them on a web server that
1207                  * contains information interesting only to Windows
1208                  * users) do not send their timestamp in the SYN. It is an easy
1209                  * case: we simply do not advertise TS support.
1210                  */
1211                 tmp_opt.saw_tstamp = 0;
1212                 tmp_opt.tstamp_ok  = 0;
1213         }
1214         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1215
1216         tcp_openreq_init(req, &tmp_opt, skb);
1217
1218         if (security_inet_conn_request(sk, skb, req))
1219                 goto drop_and_free;
1220
1221         ireq = inet_rsk(req);
1222         ireq->loc_addr = daddr;
1223         ireq->rmt_addr = saddr;
1224         ireq->opt = tcp_v4_save_options(sk, skb);
1225         if (!want_cookie)
1226                 TCP_ECN_create_request(req, tcp_hdr(skb));
1227
1228         if (want_cookie) {
1229 #ifdef CONFIG_SYN_COOKIES
1230                 syn_flood_warning(skb);
1231                 req->cookie_ts = tmp_opt.tstamp_ok;
1232 #endif
1233                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1234         } else if (!isn) {
1235                 struct inet_peer *peer = NULL;
1236
1237                 /* VJ's idea. We save the last timestamp seen
1238                  * from the destination in the peer table when entering
1239                  * TIME-WAIT state, and check against it before
1240                  * accepting a new connection request.
1241                  *
1242                  * If "isn" is not zero, this request hit an alive
1243                  * timewait bucket, so all the necessary checks
1244                  * are made in the function processing the timewait state.
1245                  */
1246                 if (tmp_opt.saw_tstamp &&
1247                     tcp_death_row.sysctl_tw_recycle &&
1248                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1249                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1250                     peer->v4daddr == saddr) {
1251                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1252                             (s32)(peer->tcp_ts - req->ts_recent) >
1253                                                         TCP_PAWS_WINDOW) {
1254                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1255                                 goto drop_and_release;
1256                         }
1257                 }
1258                 /* Kill the following clause if you dislike this approach. */
1259                 else if (!sysctl_tcp_syncookies &&
1260                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1261                           (sysctl_max_syn_backlog >> 2)) &&
1262                          (!peer || !peer->tcp_ts_stamp) &&
1263                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1264                         /* Without syncookies, the last quarter of the
1265                          * backlog is filled with destinations
1266                          * proven to be alive.
1267                          * It means that we continue to communicate
1268                          * with destinations already remembered as of
1269                          * the moment of the synflood.
1270                          */
1271                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1272                                        "request from " NIPQUAD_FMT "/%u\n",
1273                                        NIPQUAD(saddr),
1274                                        ntohs(tcp_hdr(skb)->source));
1275                         goto drop_and_release;
1276                 }
1277
1278                 isn = tcp_v4_init_sequence(skb);
1279         }
1280         tcp_rsk(req)->snt_isn = isn;
1281
1282         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1283                 goto drop_and_free;
1284
1285         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1286         return 0;
1287
1288 drop_and_release:
1289         dst_release(dst);
1290 drop_and_free:
1291         reqsk_free(req);
1292 drop:
1293         return 0;
1294 }
1295
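/*
 * Editorial note, a minimal userspace sketch (not part of this file): the
 * sk_acceptq_is_full() test in tcp_v4_conn_request() above is bounded by
 * the backlog argument an application passes to listen(). The helper name
 * and port are hypothetical; error handling is elided.
 */
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int start_listener(void)
{
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(8080);
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));

        /* 128 bounds the accept queue checked by sk_acceptq_is_full() */
        listen(fd, 128);
        return fd;
}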
1296
1297 /*
1298  * The three way handshake has completed - we got a valid synack -
1299  * now create the new socket.
1300  */
1301 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1302                                   struct request_sock *req,
1303                                   struct dst_entry *dst)
1304 {
1305         struct inet_request_sock *ireq;
1306         struct inet_sock *newinet;
1307         struct tcp_sock *newtp;
1308         struct sock *newsk;
1309 #ifdef CONFIG_TCP_MD5SIG
1310         struct tcp_md5sig_key *key;
1311 #endif
1312
1313         if (sk_acceptq_is_full(sk))
1314                 goto exit_overflow;
1315
1316         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1317                 goto exit;
1318
1319         newsk = tcp_create_openreq_child(sk, req, skb);
1320         if (!newsk)
1321                 goto exit;
1322
1323         newsk->sk_gso_type = SKB_GSO_TCPV4;
1324         sk_setup_caps(newsk, dst);
1325
1326         newtp                 = tcp_sk(newsk);
1327         newinet               = inet_sk(newsk);
1328         ireq                  = inet_rsk(req);
1329         newinet->daddr        = ireq->rmt_addr;
1330         newinet->rcv_saddr    = ireq->loc_addr;
1331         newinet->saddr        = ireq->loc_addr;
1332         newinet->opt          = ireq->opt;
1333         ireq->opt             = NULL;
1334         newinet->mc_index     = inet_iif(skb);
1335         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1336         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1337         if (newinet->opt)
1338                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1339         newinet->id = newtp->write_seq ^ jiffies;
1340
1341         tcp_mtup_init(newsk);
1342         tcp_sync_mss(newsk, dst_mtu(dst));
1343         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1344         tcp_initialize_rcv_mss(newsk);
1345
1346 #ifdef CONFIG_TCP_MD5SIG
1347         /* Copy over the MD5 key from the original socket */
1348         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1349                 /*
1350                  * We're using one, so create a matching key
1351                  * on the newsk structure. If we fail to get
1352                  * memory, then we end up not copying the key
1353                  * across. Shucks.
1354                  */
1355                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1356                 if (newkey != NULL)
1357                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1358                                           newkey, key->keylen);
1359         }
1360 #endif
1361
1362         __inet_hash_nolisten(newsk);
1363         __inet_inherit_port(sk, newsk);
1364
1365         return newsk;
1366
1367 exit_overflow:
1368         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1369 exit:
1370         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1371         dst_release(dst);
1372         return NULL;
1373 }
1374
1375 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1376 {
1377         struct tcphdr *th = tcp_hdr(skb);
1378         const struct iphdr *iph = ip_hdr(skb);
1379         struct sock *nsk;
1380         struct request_sock **prev;
1381         /* Find possible connection requests. */
1382         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1383                                                        iph->saddr, iph->daddr);
1384         if (req)
1385                 return tcp_check_req(sk, skb, req, prev);
1386
1387         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1388                         th->source, iph->daddr, th->dest, inet_iif(skb));
1389
1390         if (nsk) {
1391                 if (nsk->sk_state != TCP_TIME_WAIT) {
1392                         bh_lock_sock(nsk);
1393                         return nsk;
1394                 }
1395                 inet_twsk_put(inet_twsk(nsk));
1396                 return NULL;
1397         }
1398
1399 #ifdef CONFIG_SYN_COOKIES
1400         if (!th->rst && !th->syn && th->ack)
1401                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1402 #endif
1403         return sk;
1404 }
1405
1406 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1407 {
1408         const struct iphdr *iph = ip_hdr(skb);
1409
1410         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1411                 if (!tcp_v4_check(skb->len, iph->saddr,
1412                                   iph->daddr, skb->csum)) {
1413                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1414                         return 0;
1415                 }
1416         }
1417
1418         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1419                                        skb->len, IPPROTO_TCP, 0);
1420
1421         if (skb->len <= 76) {
1422                 return __skb_checksum_complete(skb);
1423         }
1424         return 0;
1425 }
1426
1427
1428 /* The socket must have its spinlock held when we get
1429  * here.
1430  *
1431  * We have a potential double-lock case here, so even when
1432  * doing backlog processing we use the BH locking scheme.
1433  * This is because we cannot sleep with the original spinlock
1434  * held.
1435  */
1436 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1437 {
1438         struct sock *rsk;
1439 #ifdef CONFIG_TCP_MD5SIG
1440         /*
1441          * We really want to reject the packet as early as possible
1442          * if:
1443          *  o We're expecting an MD5'd packet and the segment has no MD5 option
1444          *  o There is an MD5 option and we're not expecting one
1445          */
1446         if (tcp_v4_inbound_md5_hash(sk, skb))
1447                 goto discard;
1448 #endif
1449
1450         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1451                 TCP_CHECK_TIMER(sk);
1452                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1453                         rsk = sk;
1454                         goto reset;
1455                 }
1456                 TCP_CHECK_TIMER(sk);
1457                 return 0;
1458         }
1459
1460         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1461                 goto csum_err;
1462
1463         if (sk->sk_state == TCP_LISTEN) {
1464                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1465                 if (!nsk)
1466                         goto discard;
1467
1468                 if (nsk != sk) {
1469                         if (tcp_child_process(sk, nsk, skb)) {
1470                                 rsk = nsk;
1471                                 goto reset;
1472                         }
1473                         return 0;
1474                 }
1475         }
1476
1477         TCP_CHECK_TIMER(sk);
1478         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1479                 rsk = sk;
1480                 goto reset;
1481         }
1482         TCP_CHECK_TIMER(sk);
1483         return 0;
1484
1485 reset:
1486         tcp_v4_send_reset(rsk, skb);
1487 discard:
1488         kfree_skb(skb);
1489         /* Be careful here. If this function gets more complicated and
1490          * gcc suffers from register pressure on the x86, sk (in %ebx)
1491          * might be destroyed here. This current version compiles correctly,
1492          * but you have been warned.
1493          */
1494         return 0;
1495
1496 csum_err:
1497         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1498         goto discard;
1499 }
1500
1501 /*
1502  *      From tcp_input.c
1503  */
1504
1505 int tcp_v4_rcv(struct sk_buff *skb)
1506 {
1507         const struct iphdr *iph;
1508         struct tcphdr *th;
1509         struct sock *sk;
1510         int ret;
1511         struct net *net = dev_net(skb->dev);
1512
1513         if (skb->pkt_type != PACKET_HOST)
1514                 goto discard_it;
1515
1516         /* Count it even if it's bad */
1517         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1518
1519         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1520                 goto discard_it;
1521
1522         th = tcp_hdr(skb);
1523
1524         if (th->doff < sizeof(struct tcphdr) / 4)
1525                 goto bad_packet;
1526         if (!pskb_may_pull(skb, th->doff * 4))
1527                 goto discard_it;
1528
1529         /* Packet length and doff are validated later by header
1530          * prediction, provided the th->doff == 0 case has already been
1531          * eliminated (it was, just above), so those checks can be
1532          * deferred here. */
1533         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1534                 goto bad_packet;
1535
1536         th = tcp_hdr(skb);
1537         iph = ip_hdr(skb);
1538         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1539         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1540                                     skb->len - th->doff * 4);
1541         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1542         TCP_SKB_CB(skb)->when    = 0;
1543         TCP_SKB_CB(skb)->flags   = iph->tos;
1544         TCP_SKB_CB(skb)->sacked  = 0;
1545
1546         sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1547                         th->source, iph->daddr, th->dest, inet_iif(skb));
1548         if (!sk)
1549                 goto no_tcp_socket;
1550
1551 process:
1552         if (sk->sk_state == TCP_TIME_WAIT)
1553                 goto do_time_wait;
1554
1555         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1556                 goto discard_and_relse;
1557         nf_reset(skb);
1558
1559         if (sk_filter(sk, skb))
1560                 goto discard_and_relse;
1561
1562         skb->dev = NULL;
1563
1564         bh_lock_sock_nested(sk);
1565         ret = 0;
1566         if (!sock_owned_by_user(sk)) {
1567 #ifdef CONFIG_NET_DMA
1568                 struct tcp_sock *tp = tcp_sk(sk);
1569                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1570                         tp->ucopy.dma_chan = get_softnet_dma();
1571                 if (tp->ucopy.dma_chan)
1572                         ret = tcp_v4_do_rcv(sk, skb);
1573                 else
1574 #endif
1575                 {
1576                         if (!tcp_prequeue(sk, skb))
1577                                 ret = tcp_v4_do_rcv(sk, skb);
1578                 }
1579         } else
1580                 sk_add_backlog(sk, skb);
1581         bh_unlock_sock(sk);
1582
1583         sock_put(sk);
1584
1585         return ret;
1586
1587 no_tcp_socket:
1588         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1589                 goto discard_it;
1590
1591         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1592 bad_packet:
1593                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1594         } else {
1595                 tcp_v4_send_reset(NULL, skb);
1596         }
1597
1598 discard_it:
1599         /* Discard frame. */
1600         kfree_skb(skb);
1601         return 0;
1602
1603 discard_and_relse:
1604         sock_put(sk);
1605         goto discard_it;
1606
1607 do_time_wait:
1608         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1609                 inet_twsk_put(inet_twsk(sk));
1610                 goto discard_it;
1611         }
1612
1613         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1614                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1615                 inet_twsk_put(inet_twsk(sk));
1616                 goto discard_it;
1617         }
1618         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1619         case TCP_TW_SYN: {
1620                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1621                                                         &tcp_hashinfo,
1622                                                         iph->daddr, th->dest,
1623                                                         inet_iif(skb));
1624                 if (sk2) {
1625                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1626                         inet_twsk_put(inet_twsk(sk));
1627                         sk = sk2;
1628                         goto process;
1629                 }
1630                 /* Fall through to ACK */
1631         }
1632         case TCP_TW_ACK:
1633                 tcp_v4_timewait_ack(sk, skb);
1634                 break;
1635         case TCP_TW_RST:
1636                 goto no_tcp_socket;
1637         case TCP_TW_SUCCESS:;
1638         }
1639         goto discard_it;
1640 }
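
/*
 * Sketch of the sequence-space accounting used when end_seq is filled in
 * above: SYN and FIN each occupy one sequence number, so a pure SYN spans
 * exactly one and a data segment spans its payload length.
 */
#include <stdint.h>

static uint32_t tcp_end_seq(uint32_t seq, uint32_t payload_len, int syn, int fin)
{
	/* e.g. seq = 1000, 100 bytes of data, FIN set -> end_seq = 1101 */
	return seq + payload_len + (syn ? 1 : 0) + (fin ? 1 : 0);
}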
1641
1642 /* VJ's idea. Save last timestamp seen from this destination
1643  * and hold it for at least the normal timewait interval, to use for
1644  * duplicate segment detection in subsequent connections before they
1645  * enter the synchronized state.
1646  */
1647
1648 int tcp_v4_remember_stamp(struct sock *sk)
1649 {
1650         struct inet_sock *inet = inet_sk(sk);
1651         struct tcp_sock *tp = tcp_sk(sk);
1652         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1653         struct inet_peer *peer = NULL;
1654         int release_it = 0;
1655
1656         if (!rt || rt->rt_dst != inet->daddr) {
1657                 peer = inet_getpeer(inet->daddr, 1);
1658                 release_it = 1;
1659         } else {
1660                 if (!rt->peer)
1661                         rt_bind_peer(rt, 1);
1662                 peer = rt->peer;
1663         }
1664
1665         if (peer) {
1666                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1667                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1668                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1669                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1670                         peer->tcp_ts = tp->rx_opt.ts_recent;
1671                 }
1672                 if (release_it)
1673                         inet_putpeer(peer);
1674                 return 1;
1675         }
1676
1677         return 0;
1678 }
1679
1680 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1681 {
1682         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1683
1684         if (peer) {
1685                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1686
1687                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1688                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1689                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1690                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1691                         peer->tcp_ts       = tcptw->tw_ts_recent;
1692                 }
1693                 inet_putpeer(peer);
1694                 return 1;
1695         }
1696
1697         return 0;
1698 }
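
/*
 * Both helpers above compare cached and received timestamps with the
 * wrap-safe idiom (s32)(a - b) <= 0.  A minimal sketch of that serial-number
 * comparison, which stays correct when the 32-bit timestamp clock wraps:
 */
#include <stdint.h>

/* true if timestamp a is no newer than b, modulo 2^32 */
static int tcp_ts_before_eq(uint32_t a, uint32_t b)
{
	/* e.g. a = 0xfffffff0 (just before wrap), b = 0x00000010 (just after):
	 * a - b = 0xffffffe0, negative as int32_t, so a counts as older. */
	return (int32_t)(a - b) <= 0;
}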
1699
1700 struct inet_connection_sock_af_ops ipv4_specific = {
1701         .queue_xmit        = ip_queue_xmit,
1702         .send_check        = tcp_v4_send_check,
1703         .rebuild_header    = inet_sk_rebuild_header,
1704         .conn_request      = tcp_v4_conn_request,
1705         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1706         .remember_stamp    = tcp_v4_remember_stamp,
1707         .net_header_len    = sizeof(struct iphdr),
1708         .setsockopt        = ip_setsockopt,
1709         .getsockopt        = ip_getsockopt,
1710         .addr2sockaddr     = inet_csk_addr2sockaddr,
1711         .sockaddr_len      = sizeof(struct sockaddr_in),
1712         .bind_conflict     = inet_csk_bind_conflict,
1713 #ifdef CONFIG_COMPAT
1714         .compat_setsockopt = compat_ip_setsockopt,
1715         .compat_getsockopt = compat_ip_getsockopt,
1716 #endif
1717 };
1718
1719 #ifdef CONFIG_TCP_MD5SIG
1720 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1721         .md5_lookup             = tcp_v4_md5_lookup,
1722         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1723         .md5_add                = tcp_v4_md5_add_func,
1724         .md5_parse              = tcp_v4_parse_md5_keys,
1725 };
1726 #endif
1727
1728 /* NOTE: A lot of things are set to zero explicitly by the call to
1729  *       sk_alloc(), so they need not be done here.
1730  */
1731 static int tcp_v4_init_sock(struct sock *sk)
1732 {
1733         struct inet_connection_sock *icsk = inet_csk(sk);
1734         struct tcp_sock *tp = tcp_sk(sk);
1735
1736         skb_queue_head_init(&tp->out_of_order_queue);
1737         tcp_init_xmit_timers(sk);
1738         tcp_prequeue_init(tp);
1739
1740         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1741         tp->mdev = TCP_TIMEOUT_INIT;
1742
1743         /* So many TCP implementations out there (incorrectly) count the
1744          * initial SYN frame in their delayed-ACK and congestion control
1745          * algorithms that we must have the following bandaid to talk
1746          * efficiently to them.  -DaveM
1747          */
1748         tp->snd_cwnd = 2;
1749
1750         /* See draft-stevens-tcpca-spec-01 for discussion of the
1751          * initialization of these values.
1752          */
1753         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1754         tp->snd_cwnd_clamp = ~0;
1755         tp->mss_cache = 536;
1756
1757         tp->reordering = sysctl_tcp_reordering;
1758         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1759
1760         sk->sk_state = TCP_CLOSE;
1761
1762         sk->sk_write_space = sk_stream_write_space;
1763         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1764
1765         icsk->icsk_af_ops = &ipv4_specific;
1766         icsk->icsk_sync_mss = tcp_sync_mss;
1767 #ifdef CONFIG_TCP_MD5SIG
1768         tp->af_specific = &tcp_sock_ipv4_specific;
1769 #endif
1770
1771         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1772         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1773
1774         atomic_inc(&tcp_sockets_allocated);
1775
1776         return 0;
1777 }
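
/*
 * Illustrative userspace sketch: the values initialised above (congestion
 * window, slow-start threshold, RTO) can be observed on a live socket with
 * getsockopt(TCP_INFO).  Field names are those of struct tcp_info in
 * <linux/tcp.h>; error handling is kept minimal for brevity.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* struct tcp_info, TCP_INFO */

static void dump_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("cwnd=%u ssthresh=%u rto=%uus\n",
		       info.tcpi_snd_cwnd, info.tcpi_snd_ssthresh,
		       info.tcpi_rto);
}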
1778
1779 void tcp_v4_destroy_sock(struct sock *sk)
1780 {
1781         struct tcp_sock *tp = tcp_sk(sk);
1782
1783         tcp_clear_xmit_timers(sk);
1784
1785         tcp_cleanup_congestion_control(sk);
1786
1787         /* Clean up the write buffer. */
1788         tcp_write_queue_purge(sk);
1789
1790         /* Cleans up our, hopefully empty, out_of_order_queue. */
1791         __skb_queue_purge(&tp->out_of_order_queue);
1792
1793 #ifdef CONFIG_TCP_MD5SIG
1794         /* Clean up the MD5 key list, if any */
1795         if (tp->md5sig_info) {
1796                 tcp_v4_clear_md5_list(sk);
1797                 kfree(tp->md5sig_info);
1798                 tp->md5sig_info = NULL;
1799         }
1800 #endif
1801
1802 #ifdef CONFIG_NET_DMA
1803         /* Cleans up our sk_async_wait_queue */
1804         __skb_queue_purge(&sk->sk_async_wait_queue);
1805 #endif
1806
1807         /* Clean the prequeue; it really must be empty. */
1808         __skb_queue_purge(&tp->ucopy.prequeue);
1809
1810         /* Clean up a referenced TCP bind bucket. */
1811         if (inet_csk(sk)->icsk_bind_hash)
1812                 inet_put_port(sk);
1813
1814         /*
1815          * If a cached sendmsg page exists, toss it.
1816          */
1817         if (sk->sk_sndmsg_page) {
1818                 __free_page(sk->sk_sndmsg_page);
1819                 sk->sk_sndmsg_page = NULL;
1820         }
1821
1822         atomic_dec(&tcp_sockets_allocated);
1823 }
1824
1825 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1826
1827 #ifdef CONFIG_PROC_FS
1828 /* Proc filesystem TCP sock list dumping. */
1829
1830 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1831 {
1832         return hlist_empty(head) ? NULL :
1833                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1834 }
1835
1836 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1837 {
1838         return tw->tw_node.next ?
1839                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1840 }
1841
1842 static void *listening_get_next(struct seq_file *seq, void *cur)
1843 {
1844         struct inet_connection_sock *icsk;
1845         struct hlist_node *node;
1846         struct sock *sk = cur;
1847         struct tcp_iter_state *st = seq->private;
1848         struct net *net = seq_file_net(seq);
1849
1850         if (!sk) {
1851                 st->bucket = 0;
1852                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1853                 goto get_sk;
1854         }
1855
1856         ++st->num;
1857
1858         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1859                 struct request_sock *req = cur;
1860
1861                 icsk = inet_csk(st->syn_wait_sk);
1862                 req = req->dl_next;
1863                 while (1) {
1864                         while (req) {
1865                                 if (req->rsk_ops->family == st->family &&
1866                                     net_eq(sock_net(req->sk), net)) {
1867                                         cur = req;
1868                                         goto out;
1869                                 }
1870                                 req = req->dl_next;
1871                         }
1872                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1873                                 break;
1874 get_req:
1875                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1876                 }
1877                 sk        = sk_next(st->syn_wait_sk);
1878                 st->state = TCP_SEQ_STATE_LISTENING;
1879                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1880         } else {
1881                 icsk = inet_csk(sk);
1882                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1883                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1884                         goto start_req;
1885                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1886                 sk = sk_next(sk);
1887         }
1888 get_sk:
1889         sk_for_each_from(sk, node) {
1890                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1891                         cur = sk;
1892                         goto out;
1893                 }
1894                 icsk = inet_csk(sk);
1895                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1896                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1897 start_req:
1898                         st->uid         = sock_i_uid(sk);
1899                         st->syn_wait_sk = sk;
1900                         st->state       = TCP_SEQ_STATE_OPENREQ;
1901                         st->sbucket     = 0;
1902                         goto get_req;
1903                 }
1904                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1905         }
1906         if (++st->bucket < INET_LHTABLE_SIZE) {
1907                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1908                 goto get_sk;
1909         }
1910         cur = NULL;
1911 out:
1912         return cur;
1913 }
1914
1915 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1916 {
1917         void *rc = listening_get_next(seq, NULL);
1918
1919         while (rc && *pos) {
1920                 rc = listening_get_next(seq, rc);
1921                 --*pos;
1922         }
1923         return rc;
1924 }
1925
1926 static void *established_get_first(struct seq_file *seq)
1927 {
1928         struct tcp_iter_state *st = seq->private;
1929         struct net *net = seq_file_net(seq);
1930         void *rc = NULL;
1931
1932         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1933                 struct sock *sk;
1934                 struct hlist_node *node;
1935                 struct inet_timewait_sock *tw;
1936                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1937
1938                 read_lock_bh(lock);
1939                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1940                         if (sk->sk_family != st->family ||
1941                             !net_eq(sock_net(sk), net)) {
1942                                 continue;
1943                         }
1944                         rc = sk;
1945                         goto out;
1946                 }
1947                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1948                 inet_twsk_for_each(tw, node,
1949                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1950                         if (tw->tw_family != st->family ||
1951                             !net_eq(twsk_net(tw), net)) {
1952                                 continue;
1953                         }
1954                         rc = tw;
1955                         goto out;
1956                 }
1957                 read_unlock_bh(lock);
1958                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1959         }
1960 out:
1961         return rc;
1962 }
1963
1964 static void *established_get_next(struct seq_file *seq, void *cur)
1965 {
1966         struct sock *sk = cur;
1967         struct inet_timewait_sock *tw;
1968         struct hlist_node *node;
1969         struct tcp_iter_state *st = seq->private;
1970         struct net *net = seq_file_net(seq);
1971
1972         ++st->num;
1973
1974         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1975                 tw = cur;
1976                 tw = tw_next(tw);
1977 get_tw:
1978                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1979                         tw = tw_next(tw);
1980                 }
1981                 if (tw) {
1982                         cur = tw;
1983                         goto out;
1984                 }
1985                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1986                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1987
1988                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1989                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1990                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1991                 } else {
1992                         cur = NULL;
1993                         goto out;
1994                 }
1995         } else
1996                 sk = sk_next(sk);
1997
1998         sk_for_each_from(sk, node) {
1999                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2000                         goto found;
2001         }
2002
2003         st->state = TCP_SEQ_STATE_TIME_WAIT;
2004         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2005         goto get_tw;
2006 found:
2007         cur = sk;
2008 out:
2009         return cur;
2010 }
2011
2012 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2013 {
2014         void *rc = established_get_first(seq);
2015
2016         while (rc && pos) {
2017                 rc = established_get_next(seq, rc);
2018                 --pos;
2019         }
2020         return rc;
2021 }
2022
2023 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2024 {
2025         void *rc;
2026         struct tcp_iter_state *st = seq->private;
2027
2028         inet_listen_lock(&tcp_hashinfo);
2029         st->state = TCP_SEQ_STATE_LISTENING;
2030         rc        = listening_get_idx(seq, &pos);
2031
2032         if (!rc) {
2033                 inet_listen_unlock(&tcp_hashinfo);
2034                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2035                 rc        = established_get_idx(seq, pos);
2036         }
2037
2038         return rc;
2039 }
2040
2041 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2042 {
2043         struct tcp_iter_state *st = seq->private;
2044         st->state = TCP_SEQ_STATE_LISTENING;
2045         st->num = 0;
2046         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2047 }
2048
2049 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2050 {
2051         void *rc = NULL;
2052         struct tcp_iter_state *st;
2053
2054         if (v == SEQ_START_TOKEN) {
2055                 rc = tcp_get_idx(seq, 0);
2056                 goto out;
2057         }
2058         st = seq->private;
2059
2060         switch (st->state) {
2061         case TCP_SEQ_STATE_OPENREQ:
2062         case TCP_SEQ_STATE_LISTENING:
2063                 rc = listening_get_next(seq, v);
2064                 if (!rc) {
2065                         inet_listen_unlock(&tcp_hashinfo);
2066                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2067                         rc        = established_get_first(seq);
2068                 }
2069                 break;
2070         case TCP_SEQ_STATE_ESTABLISHED:
2071         case TCP_SEQ_STATE_TIME_WAIT:
2072                 rc = established_get_next(seq, v);
2073                 break;
2074         }
2075 out:
2076         ++*pos;
2077         return rc;
2078 }
2079
2080 static void tcp_seq_stop(struct seq_file *seq, void *v)
2081 {
2082         struct tcp_iter_state *st = seq->private;
2083
2084         switch (st->state) {
2085         case TCP_SEQ_STATE_OPENREQ:
2086                 if (v) {
2087                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2088                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2089                 }
2090         case TCP_SEQ_STATE_LISTENING:
2091                 if (v != SEQ_START_TOKEN)
2092                         inet_listen_unlock(&tcp_hashinfo);
2093                 break;
2094         case TCP_SEQ_STATE_TIME_WAIT:
2095         case TCP_SEQ_STATE_ESTABLISHED:
2096                 if (v)
2097                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098                 break;
2099         }
2100 }
2101
2102 static int tcp_seq_open(struct inode *inode, struct file *file)
2103 {
2104         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2105         struct tcp_iter_state *s;
2106         int err;
2107
2108         err = seq_open_net(inode, file, &afinfo->seq_ops,
2109                           sizeof(struct tcp_iter_state));
2110         if (err < 0)
2111                 return err;
2112
2113         s = ((struct seq_file *)file->private_data)->private;
2114         s->family               = afinfo->family;
2115         return 0;
2116 }
2117
2118 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2119 {
2120         int rc = 0;
2121         struct proc_dir_entry *p;
2122
2123         afinfo->seq_fops.open           = tcp_seq_open;
2124         afinfo->seq_fops.read           = seq_read;
2125         afinfo->seq_fops.llseek         = seq_lseek;
2126         afinfo->seq_fops.release        = seq_release_net;
2127
2128         afinfo->seq_ops.start           = tcp_seq_start;
2129         afinfo->seq_ops.next            = tcp_seq_next;
2130         afinfo->seq_ops.stop            = tcp_seq_stop;
2131
2132         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2133                              &afinfo->seq_fops, afinfo);
2134         if (!p)
2135                 rc = -ENOMEM;
2136         return rc;
2137 }
2138
2139 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2140 {
2141         proc_net_remove(net, afinfo->name);
2142 }
2143
2144 static void get_openreq4(struct sock *sk, struct request_sock *req,
2145                          struct seq_file *f, int i, int uid, int *len)
2146 {
2147         const struct inet_request_sock *ireq = inet_rsk(req);
2148         int ttd = req->expires - jiffies;
2149
2150         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2151                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2152                 i,
2153                 ireq->loc_addr,
2154                 ntohs(inet_sk(sk)->sport),
2155                 ireq->rmt_addr,
2156                 ntohs(ireq->rmt_port),
2157                 TCP_SYN_RECV,
2158                 0, 0, /* could print option size, but that is af dependent. */
2159                 1,    /* timers active (only the expire timer) */
2160                 jiffies_to_clock_t(ttd),
2161                 req->retrans,
2162                 uid,
2163                 0,  /* non standard timer */
2164                 0, /* open_requests have no inode */
2165                 atomic_read(&sk->sk_refcnt),
2166                 req,
2167                 len);
2168 }
2169
2170 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2171 {
2172         int timer_active;
2173         unsigned long timer_expires;
2174         struct tcp_sock *tp = tcp_sk(sk);
2175         const struct inet_connection_sock *icsk = inet_csk(sk);
2176         struct inet_sock *inet = inet_sk(sk);
2177         __be32 dest = inet->daddr;
2178         __be32 src = inet->rcv_saddr;
2179         __u16 destp = ntohs(inet->dport);
2180         __u16 srcp = ntohs(inet->sport);
2181
2182         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2183                 timer_active    = 1;
2184                 timer_expires   = icsk->icsk_timeout;
2185         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2186                 timer_active    = 4;
2187                 timer_expires   = icsk->icsk_timeout;
2188         } else if (timer_pending(&sk->sk_timer)) {
2189                 timer_active    = 2;
2190                 timer_expires   = sk->sk_timer.expires;
2191         } else {
2192                 timer_active    = 0;
2193                 timer_expires = jiffies;
2194         }
2195
2196         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2197                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2198                 i, src, srcp, dest, destp, sk->sk_state,
2199                 tp->write_seq - tp->snd_una,
2200                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2201                                              (tp->rcv_nxt - tp->copied_seq),
2202                 timer_active,
2203                 jiffies_to_clock_t(timer_expires - jiffies),
2204                 icsk->icsk_retransmits,
2205                 sock_i_uid(sk),
2206                 icsk->icsk_probes_out,
2207                 sock_i_ino(sk),
2208                 atomic_read(&sk->sk_refcnt), sk,
2209                 jiffies_to_clock_t(icsk->icsk_rto),
2210                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2211                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2212                 tp->snd_cwnd,
2213                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2214                 len);
2215 }
2216
2217 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2218                                struct seq_file *f, int i, int *len)
2219 {
2220         __be32 dest, src;
2221         __u16 destp, srcp;
2222         int ttd = tw->tw_ttd - jiffies;
2223
2224         if (ttd < 0)
2225                 ttd = 0;
2226
2227         dest  = tw->tw_daddr;
2228         src   = tw->tw_rcv_saddr;
2229         destp = ntohs(tw->tw_dport);
2230         srcp  = ntohs(tw->tw_sport);
2231
2232         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2233                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2234                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2235                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2236                 atomic_read(&tw->tw_refcnt), tw, len);
2237 }
2238
2239 #define TMPSZ 150
2240
2241 static int tcp4_seq_show(struct seq_file *seq, void *v)
2242 {
2243         struct tcp_iter_state *st;
2244         int len;
2245
2246         if (v == SEQ_START_TOKEN) {
2247                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2248                            "  sl  local_address rem_address   st tx_queue "
2249                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2250                            "inode");
2251                 goto out;
2252         }
2253         st = seq->private;
2254
2255         switch (st->state) {
2256         case TCP_SEQ_STATE_LISTENING:
2257         case TCP_SEQ_STATE_ESTABLISHED:
2258                 get_tcp4_sock(v, seq, st->num, &len);
2259                 break;
2260         case TCP_SEQ_STATE_OPENREQ:
2261                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2262                 break;
2263         case TCP_SEQ_STATE_TIME_WAIT:
2264                 get_timewait4_sock(v, seq, st->num, &len);
2265                 break;
2266         }
2267         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2268 out:
2269         return 0;
2270 }
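
/*
 * Illustrative userspace sketch: the seq_file handlers above produce
 * /proc/net/tcp, one line per socket in the format emitted by
 * get_tcp4_sock()/get_openreq4()/get_timewait4_sock().  A minimal parser for
 * the hex address:port and state columns might look like this; /proc prints
 * the raw __be32, so storing the scanned value back as a 32-bit integer
 * restores the original network-order bytes on the same machine.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void dump_proc_net_tcp(void)
{
	FILE *fp = fopen("/proc/net/tcp", "r");
	char line[256];
	unsigned int laddr, lport, raddr, rport, state;

	if (!fp)
		return;
	fgets(line, sizeof(line), fp);		/* skip the header line */
	while (fgets(line, sizeof(line), fp)) {
		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5) {
			struct in_addr a = { .s_addr = laddr };

			printf("%s:%u state %#x\n", inet_ntoa(a), lport, state);
		}
	}
	fclose(fp);
}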
2271
2272 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2273         .name           = "tcp",
2274         .family         = AF_INET,
2275         .seq_fops       = {
2276                 .owner          = THIS_MODULE,
2277         },
2278         .seq_ops        = {
2279                 .show           = tcp4_seq_show,
2280         },
2281 };
2282
2283 static int tcp4_proc_init_net(struct net *net)
2284 {
2285         return tcp_proc_register(net, &tcp4_seq_afinfo);
2286 }
2287
2288 static void tcp4_proc_exit_net(struct net *net)
2289 {
2290         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2291 }
2292
2293 static struct pernet_operations tcp4_net_ops = {
2294         .init = tcp4_proc_init_net,
2295         .exit = tcp4_proc_exit_net,
2296 };
2297
2298 int __init tcp4_proc_init(void)
2299 {
2300         return register_pernet_subsys(&tcp4_net_ops);
2301 }
2302
2303 void tcp4_proc_exit(void)
2304 {
2305         unregister_pernet_subsys(&tcp4_net_ops);
2306 }
2307 #endif /* CONFIG_PROC_FS */
2308
2309 struct proto tcp_prot = {
2310         .name                   = "TCP",
2311         .owner                  = THIS_MODULE,
2312         .close                  = tcp_close,
2313         .connect                = tcp_v4_connect,
2314         .disconnect             = tcp_disconnect,
2315         .accept                 = inet_csk_accept,
2316         .ioctl                  = tcp_ioctl,
2317         .init                   = tcp_v4_init_sock,
2318         .destroy                = tcp_v4_destroy_sock,
2319         .shutdown               = tcp_shutdown,
2320         .setsockopt             = tcp_setsockopt,
2321         .getsockopt             = tcp_getsockopt,
2322         .recvmsg                = tcp_recvmsg,
2323         .backlog_rcv            = tcp_v4_do_rcv,
2324         .hash                   = inet_hash,
2325         .unhash                 = inet_unhash,
2326         .get_port               = inet_csk_get_port,
2327         .enter_memory_pressure  = tcp_enter_memory_pressure,
2328         .sockets_allocated      = &tcp_sockets_allocated,
2329         .orphan_count           = &tcp_orphan_count,
2330         .memory_allocated       = &tcp_memory_allocated,
2331         .memory_pressure        = &tcp_memory_pressure,
2332         .sysctl_mem             = sysctl_tcp_mem,
2333         .sysctl_wmem            = sysctl_tcp_wmem,
2334         .sysctl_rmem            = sysctl_tcp_rmem,
2335         .max_header             = MAX_TCP_HEADER,
2336         .obj_size               = sizeof(struct tcp_sock),
2337         .twsk_prot              = &tcp_timewait_sock_ops,
2338         .rsk_prot               = &tcp_request_sock_ops,
2339         .h.hashinfo             = &tcp_hashinfo,
2340 #ifdef CONFIG_COMPAT
2341         .compat_setsockopt      = compat_tcp_setsockopt,
2342         .compat_getsockopt      = compat_tcp_getsockopt,
2343 #endif
2344 };
2345
2346
2347 static int __net_init tcp_sk_init(struct net *net)
2348 {
2349         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2350                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2351 }
2352
2353 static void __net_exit tcp_sk_exit(struct net *net)
2354 {
2355         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2356 }
2357
2358 static struct pernet_operations __net_initdata tcp_sk_ops = {
2359        .init = tcp_sk_init,
2360        .exit = tcp_sk_exit,
2361 };
2362
2363 void __init tcp_v4_init(void)
2364 {
2365         if (register_pernet_device(&tcp_sk_ops))
2366                 panic("Failed to create the TCP control socket.\n");
2367 }
2368
2369 EXPORT_SYMBOL(ipv4_specific);
2370 EXPORT_SYMBOL(tcp_hashinfo);
2371 EXPORT_SYMBOL(tcp_prot);
2372 EXPORT_SYMBOL(tcp_v4_conn_request);
2373 EXPORT_SYMBOL(tcp_v4_connect);
2374 EXPORT_SYMBOL(tcp_v4_do_rcv);
2375 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2376 EXPORT_SYMBOL(tcp_v4_send_check);
2377 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2378
2379 #ifdef CONFIG_PROC_FS
2380 EXPORT_SYMBOL(tcp_proc_register);
2381 EXPORT_SYMBOL(tcp_proc_unregister);
2382 #endif
2383 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2384