/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *              IPv4 specific functions
 *
 *
 *              code split from:
 *              linux/ipv4/tcp.c
 *              linux/ipv4/tcp_input.c
 *              linux/ipv4/tcp_output.c
 *
 *              See tcp.c for author information
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *              David S. Miller :       New socket lookup architecture.
 *                                      This code is dedicated to John Dyson.
 *              David S. Miller :       Change semantics of established hash,
 *                                      half is devoted to TIME_WAIT sockets
 *                                      and the rest go in the other half.
 *              Andi Kleen :            Add support for syncookies and fixed
 *                                      some bugs: ip options weren't passed to
 *                                      the TCP layer, missed a check for an
 *                                      ACK bit.
 *              Andi Kleen :            Implemented fast path mtu discovery.
 *                                      Fixed many serious bugs in the
 *                                      request_sock handling and moved
 *                                      most of it into the af independent code.
 *                                      Added tail drop and some other bugfixes.
 *                                      Added new listen semantics.
 *              Mike McLagan    :       Routing by source
 *      Juan Jose Ciarlante:            ip_dynaddr bits
 *              Andi Kleen:             various fixes.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year
 *                                      in a coma.
 *      Andi Kleen              :       Fix new listen.
 *      Andi Kleen              :       Fix accept error reporting.
 *      YOSHIFUJI Hideaki @USAGI and:   Support the IPV6_V6ONLY socket option,
 *      Alexey Kuznetsov                which allows both IPv4 and IPv6 sockets
 *                                      to bind a single port at the same time.
 */


#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket __read_mostly;

void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
                                                   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen);
#endif

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};

static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
        return inet_csk_get_port(&tcp_hashinfo, sk, snum,
                                 inet_csk_bind_conflict);
}

static void tcp_v4_hash(struct sock *sk)
{
        inet_hash(&tcp_hashinfo, sk);
}

void tcp_unhash(struct sock *sk)
{
        inet_unhash(&tcp_hashinfo, sk);
}

static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
        return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
                                          ip_hdr(skb)->saddr,
                                          tcp_hdr(skb)->dest,
                                          tcp_hdr(skb)->source);
}
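
/*
 * Note: secure_tcp_sequence_number() mixes the connection 4-tuple with a
 * secret and a clock component (an RFC 1948 style scheme), so initial
 * sequence numbers are hard to predict off-path yet still advance over
 * time for a repeated 4-tuple.
 */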

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);

        /* With PAWS, it is safe from the viewpoint
           of data integrity. Even without PAWS it is safe provided sequence
           spaces do not overlap i.e. at data rates <= 80Mbit/sec.

           Actually, the idea is close to VJ's, except that the timestamp
           cache is held not per host but per port pair, and the TW bucket
           is used as the state holder.

           If the TW bucket has already been destroyed we fall back to VJ's
           scheme and use the initial timestamp retrieved from the peer table.
         */
        if (tcptw->tw_ts_recent_stamp &&
            (twp == NULL || (sysctl_tcp_tw_reuse &&
                             get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
                tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
                if (tp->write_seq == 0)
                        tp->write_seq = 1;
                tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
                tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                sock_hold(sktw);
                return 1;
        }

        return 0;
}

EXPORT_SYMBOL_GPL(tcp_twsk_unique);
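
/*
 * Example: if the old connection's tw_snd_nxt was 1000000, the reused
 * connection starts at write_seq = 1000000 + 65535 + 2 = 1065537, i.e.
 * just past the largest window (65535) the old peer could still be
 * holding, so stray old segments cannot be mistaken for new data.
 */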

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0)
                return tmp;

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save the last timestamp seen from
                 * the destination in the peer table when entering
                 * TIME-WAIT state, and initialize rx_opt.ts_recent
                 * from it when trying a new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;
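        /* 536 is the RFC 1122 default effective send MSS: the 576-byte
         * minimum reassembly size minus 40 bytes of IP and TCP headers.
         */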

        /* Socket identity is still unknown (sport may be zero).
         * However we set the state to SYN-SENT and, without releasing
         * the socket lock, select a source port, enter ourselves into
         * the hash tables and complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
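
/*
 * For reference, this path is reached from user space through an
 * ordinary connect(2) on a TCP socket; a minimal sketch:
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *     struct sockaddr_in sin = { .sin_family = AF_INET,
 *                                .sin_port   = htons(80) };
 *     sin.sin_addr.s_addr = inet_addr("192.0.2.1");
 *     connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * which ends up in tcp_v4_connect() via the protocol's connect hook.
 */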

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
{
        struct dst_entry *dst;
        struct inet_sock *inet = inet_sk(sk);

        /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
         * sent out by Linux are always < 576 bytes, so they should go
         * through unfragmented).
         */
        if (sk->sk_state == TCP_LISTEN)
                return;

        /* We don't check in the dst_entry whether pmtu discovery is
         * forbidden on this route. We just assume that no packet-too-big
         * packets are sent back when pmtu discovery is not active.
         * There is a small race when the user changes this flag in the
         * route, but I think that's acceptable.
         */
        if ((dst = __sk_dst_check(sk, 0)) == NULL)
                return;

        dst->ops->update_pmtu(dst, mtu);

        /* Something is about to go wrong... Remember the soft error
         * in case this connection is not able to recover.
         */
        if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                sk->sk_err_soft = EMSGSIZE;

        mtu = dst_mtu(dst);

        if (inet->pmtudisc != IP_PMTUDISC_DONT &&
            inet_csk(sk)->icsk_pmtu_cookie > mtu) {
                tcp_sync_mss(sk, mtu);

                /* Resend the TCP packet because it's
                 * clear that the old packet has been
                 * dropped. This is the new "fast" path mtu
                 * discovery.
                 */
                tcp_simple_retransmit(sk);
        } /* else let the usual retransmit timer handle it */
}
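
/*
 * Example: a router on the path with a 1400-byte MTU sends back
 * ICMP_FRAG_NEEDED with mtu = 1400; tcp_sync_mss() then drops the MSS
 * to roughly 1400 - 40 = 1360 bytes (less any TCP options in use), and
 * tcp_simple_retransmit() resends the too-big segments immediately.
 */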

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * the header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
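
/*
 * Worked example of the encoding above: ICMP_DEST_UNREACH (type 3) with
 * code ICMP_FRAG_NEEDED (4) arrives as err = (3 << 8) | 4 = 0x0304.
 */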

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;

        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
                         th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               Actually it can, e.g. if SYNs crossed.
                             */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows us to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note that in the modern internet, where routing is unreliable
         * and broken firewalls sit in every dark corner sending random
         * errors ordered by their masters, even these two messages have
         * finally lost their original sense (even Linux sends invalid
         * PORT_UNREACHs).
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                th->check = ~tcp_v4_check(len, inet->saddr,
                                          inet->daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
                th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
                                         csum_partial((char *)th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
}
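
/*
 * In the CHECKSUM_PARTIAL case above we only seed th->check with the
 * folded pseudo-header sum; the NIC (or a software fallback) later
 * checksums the data from csum_start onward and stores the result at
 * csum_start + csum_offset. The else branch computes the complete
 * checksum in software right here.
 */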

int tcp_v4_gso_send_check(struct sk_buff *skb)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!pskb_may_pull(skb, sizeof(*th)))
                return -EINVAL;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        th->check = 0;
        th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
        return 0;
}

/*
 *      This routine will send an RST to the other tcp.
 *
 *      Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
 *                    for the reset?
 *      Answer: if a packet caused the RST, it is not for a socket
 *              existing in our system; if it is matched to a socket,
 *              it is just a duplicate segment or a bug in the other
 *              side's TCP. So we build the reply based only on the
 *              parameters that arrived with the segment.
 *      Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
                __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
#endif

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }
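        /* Example: for a bare SYN with seq = 1000 and no payload,
         * skb->len equals th->doff << 2, so ack_seq = 1000 + 1 = 1001,
         * acknowledging exactly the SYN's one unit of sequence space.
         */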

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
        key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
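                /* On the wire this word reads 01 01 13 12: two NOPs for
                 * alignment, then kind 19 (MD5SIG) and length 18 (kind +
                 * length + the 16-byte digest).
                 */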
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      sizeof(struct tcphdr), IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
        TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 ts)
{
        struct tcphdr *th = tcp_hdr(skb);
        struct {
                struct tcphdr th;
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
                        ];
        } rep;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        struct tcp_md5sig_key tw_key;
#endif

        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (ts) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tcp_time_stamp);
                rep.opt[2] = htonl(ts);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }
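        /* The option word above reads 01 01 08 0a on the wire: two NOPs,
         * then kind 8 (TIMESTAMP) and length 10, followed by the 32-bit
         * TSval and TSecr values.
         */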

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        /*
         * The SKB holds an incoming packet, but may not have a valid ->sk
         * pointer. This is especially the case when we're dealing with a
         * TIME_WAIT ack, because the sk structure is long gone, and only
         * the tcp_timewait_sock remains. So the md5 key is stashed in that
         * structure, and we use it in preference.  I believe that (twsk ||
         * skb->sk) holds true, but we program defensively.
         */
        if (!twsk && skb->sk) {
                key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
        } else if (twsk && twsk->tw_md5_keylen) {
                tw_key.key = twsk->tw_md5_key;
                tw_key.keylen = twsk->tw_md5_keylen;
                key = &tw_key;
        } else
                key = NULL;

        if (key) {
                int offset = (ts) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
                                        key,
                                        ip_hdr(skb)->daddr,
                                        ip_hdr(skb)->saddr,
                                        &rep.th, IPPROTO_TCP,
                                        arg.iov[0].iov_len);
        }
#endif
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;

        ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

        TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

        tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcptw->tw_ts_recent);

        inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
                                  struct request_sock *req)
{
        tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
                        tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
                        req->ts_recent);
}

/*
 *      Send a SYN-ACK after having received a SYN.
 *      This still operates on a request_sock only, not on a big
 *      socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
                              struct dst_entry *dst)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
        struct sk_buff *skb;

        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                goto out;

        skb = tcp_make_synack(sk, dst, req);

        if (skb) {
                struct tcphdr *th = tcp_hdr(skb);

                th->check = tcp_v4_check(skb->len,
                                         ireq->loc_addr,
                                         ireq->rmt_addr,
                                         csum_partial((char *)th, skb->len,
                                                      skb->csum));

                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
                                            ireq->opt);
                err = net_xmit_eval(err);
        }

out:
        dst_release(dst);
        return err;
}

/*
 *      IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->opt);
}

#ifdef CONFIG_SYN_COOKIES
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long warntime;

        if (time_after(jiffies, (warntime + HZ * 60))) {
                warntime = jiffies;
                printk(KERN_INFO
                       "possible SYN flooding on port %d. Sending cookies.\n",
                       ntohs(tcp_hdr(skb)->dest));
        }
}
#endif

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options *tcp_v4_save_options(struct sock *sk,
                                              struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);
        struct ip_options *dopt = NULL;

        if (opt && opt->optlen) {
                int opt_size = optlength(opt);
                dopt = kmalloc(opt_size, GFP_ATOMIC);
                if (dopt) {
                        if (ip_options_echo(dopt, skb)) {
                                kfree(dopt);
                                dopt = NULL;
                        }
                }
        }
        return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */
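
/*
 * User space installs a key per peer address with setsockopt(2); a
 * minimal sketch of the caller's side (error handling omitted):
 *
 *     struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *     struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     sin->sin_family = AF_INET;
 *     sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *     memcpy(md5.tcpm_key, "secret", 6);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * That request lands in tcp_v4_parse_md5_keys() below.
 */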

/* Find the key structure for an address.  */
static struct tcp_md5sig_key *
                        tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        if (!tp->md5sig_info || !tp->md5sig_info->entries4)
                return NULL;
        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr)
                        return (struct tcp_md5sig_key *)
                                                &tp->md5sig_info->keys4[i];
        }
        return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                         struct sock *addr_sk)
{
        return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
}

EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
                                                      struct request_sock *req)
{
        return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                      u8 *newkey, u8 newkeylen)
{
        /* Add the key to the list */
        struct tcp4_md5sig_key *key;
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp4_md5sig_key *keys;

        key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
        if (key) {
                /* Pre-existing entry - just update that one. */
                kfree(key->key);
                key->key = newkey;
                key->keylen = newkeylen;
        } else {
                struct tcp_md5sig_info *md5sig;

                if (!tp->md5sig_info) {
                        tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
                                                  GFP_ATOMIC);
                        if (!tp->md5sig_info) {
                                kfree(newkey);
                                return -ENOMEM;
                        }
                }
                if (tcp_alloc_md5sig_pool() == NULL) {
                        kfree(newkey);
                        return -ENOMEM;
                }
                md5sig = tp->md5sig_info;

                if (md5sig->alloced4 == md5sig->entries4) {
                        keys = kmalloc((sizeof(*keys) *
                                        (md5sig->entries4 + 1)), GFP_ATOMIC);
                        if (!keys) {
                                kfree(newkey);
                                tcp_free_md5sig_pool();
                                return -ENOMEM;
                        }

                        if (md5sig->entries4)
                                memcpy(keys, md5sig->keys4,
                                       sizeof(*keys) * md5sig->entries4);

                        /* Free the old key list, and reference the new one */
                        if (md5sig->keys4)
                                kfree(md5sig->keys4);
                        md5sig->keys4 = keys;
                        md5sig->alloced4++;
                }
                md5sig->entries4++;
                md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
                md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
                md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
        }
        return 0;
}

EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
                               u8 *newkey, u8 newkeylen)
{
        return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
                                 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int i;

        for (i = 0; i < tp->md5sig_info->entries4; i++) {
                if (tp->md5sig_info->keys4[i].addr == addr) {
                        /* Free the key */
                        kfree(tp->md5sig_info->keys4[i].key);
                        tp->md5sig_info->entries4--;

                        if (tp->md5sig_info->entries4 == 0) {
                                kfree(tp->md5sig_info->keys4);
                                tp->md5sig_info->keys4 = NULL;
                                tp->md5sig_info->alloced4 = 0;
                        } else if (tp->md5sig_info->entries4 != i) {
                                /* Close the hole by shifting the remaining
                                 * entries down one slot.
                                 */
                                memcpy(&tp->md5sig_info->keys4[i],
                                       &tp->md5sig_info->keys4[i + 1],
                                       (tp->md5sig_info->entries4 - i) *
                                        sizeof(struct tcp4_md5sig_key));
                        }
                        tcp_free_md5sig_pool();
                        return 0;
                }
        }
        return -ENOENT;
}

EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Free each key, then the key array itself,
         * the crypto element, and then decrement our
         * hold on the last resort crypto.
         */
        if (tp->md5sig_info->entries4) {
                int i;
                for (i = 0; i < tp->md5sig_info->entries4; i++)
                        kfree(tp->md5sig_info->keys4[i].key);
                tp->md5sig_info->entries4 = 0;
                tcp_free_md5sig_pool();
        }
        if (tp->md5sig_info->keys4) {
                kfree(tp->md5sig_info->keys4);
                tp->md5sig_info->keys4 = NULL;
                tp->md5sig_info->alloced4 = 0;
        }
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                                 int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        u8 *newkey;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_user(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
                if (!tcp_sk(sk)->md5sig_info)
                        return -ENOENT;
                return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
        }

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        if (!tcp_sk(sk)->md5sig_info) {
                struct tcp_sock *tp = tcp_sk(sk);
                struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);

                if (!p)
                        return -ENOMEM;

                tp->md5sig_info = p;
        }

        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
        if (!newkey)
                return -ENOMEM;
        return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
                                 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                                   __be32 saddr, __be32 daddr,
                                   struct tcphdr *th, int protocol,
                                   int tcplen)
{
        struct scatterlist sg[4];
        __u16 data_len;
        int block = 0;
        __sum16 old_checksum;
        struct tcp_md5sig_pool *hp;
        struct tcp4_pseudohdr *bp;
        struct hash_desc *desc;
        int err;
        unsigned int nbytes = 0;

        /*
         * Okay, so RFC2385 is turned on for this connection,
         * so we need to generate the MD5 hash for the packet now.
         */

        hp = tcp_get_md5sig_pool();
        if (!hp)
                goto clear_hash_noput;

        bp = &hp->md5_blk.ip4;
        desc = &hp->md5_desc;

        /*
         * 1. the TCP pseudo-header (in the order: source IP address,
         * destination IP address, zero-padded protocol number, and
         * segment length)
         */
        bp->saddr = saddr;
        bp->daddr = daddr;
        bp->pad = 0;
        bp->protocol = protocol;
        bp->len = htons(tcplen);
        sg_set_buf(&sg[block++], bp, sizeof(*bp));
        nbytes += sizeof(*bp);

        /* 2. the TCP header, excluding options, and assuming a
         * checksum of zero.
         */
        old_checksum = th->check;
        th->check = 0;
        sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
        nbytes += sizeof(struct tcphdr);

        /* 3. the TCP segment data (if any) */
        data_len = tcplen - (th->doff << 2);
        if (data_len > 0) {
                unsigned char *data = (unsigned char *)th + (th->doff << 2);
                sg_set_buf(&sg[block++], data, data_len);
                nbytes += data_len;
        }

        /* 4. an independently-specified key or password, known to both
         * TCPs and presumably connection-specific
         */
        sg_set_buf(&sg[block++], key->key, key->keylen);
        nbytes += key->keylen;

        /* Now store the hash into the packet */
        err = crypto_hash_init(desc);
        if (err)
                goto clear_hash;
        err = crypto_hash_update(desc, sg, nbytes);
        if (err)
                goto clear_hash;
        err = crypto_hash_final(desc, md5_hash);
        if (err)
                goto clear_hash;

        /* Reset the header, and free up the crypto */
        tcp_put_md5sig_pool();
        th->check = old_checksum;

out:
        return 0;
clear_hash:
        tcp_put_md5sig_pool();
clear_hash_noput:
        memset(md5_hash, 0, 16);
        goto out;
}
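
/*
 * The four scatterlist pieces above are exactly the RFC 2385 hash
 * input; the resulting 16-byte MD5 digest is what fills the option
 * payload, which is why TCPOLEN_MD5SIG is 18 (kind + length + digest).
 */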

int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
                         struct sock *sk,
                         struct dst_entry *dst,
                         struct request_sock *req,
                         struct tcphdr *th, int protocol,
                         int tcplen)
{
        __be32 saddr, daddr;

        if (sk) {
                saddr = inet_sk(sk)->saddr;
                daddr = inet_sk(sk)->daddr;
        } else {
                struct rtable *rt = (struct rtable *)dst;
                BUG_ON(!rt);
                saddr = rt->rt_src;
                daddr = rt->rt_dst;
        }
        return tcp_v4_do_calc_md5_hash(md5_hash, key,
                                       saddr, daddr,
                                       th, protocol, tcplen);
}

EXPORT_SYMBOL(tcp_v4_calc_md5_hash);

static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
{
        /*
         * This gets called for each TCP segment that arrives,
         * so we want to be efficient.
         * We have 3 drop cases:
         * o No MD5 hash and one expected.
         * o MD5 hash and we're not expecting one.
         * o MD5 hash and it's wrong.
         */
        __u8 *hash_location = NULL;
        struct tcp_md5sig_key *hash_expected;
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        int length = (th->doff << 2) - sizeof(struct tcphdr);
        int genhash;
        unsigned char *ptr;
        unsigned char newhash[16];

        hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);

        /*
         * If the TCP option length is less than the TCP_MD5SIG
         * option length, then we can take a shortcut.
         */
        if (length < TCPOLEN_MD5SIG) {
                if (hash_expected)
                        return 1;
                else
                        return 0;
        }

        /* Okay, we can't shortcut - we have to grub through the options */
        ptr = (unsigned char *)(th + 1);
        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        goto done_opts;
                case TCPOPT_NOP:
                        length--;
                        continue;
                default:
                        opsize = *ptr++;
                        if (opsize < 2)
                                goto done_opts;
                        if (opsize > length)
                                goto done_opts;

                        if (opcode == TCPOPT_MD5SIG) {
                                hash_location = ptr;
                                goto done_opts;
                        }
                }
                ptr += opsize - 2;
                length -= opsize;
        }
done_opts:
        /* We've parsed the options - do we have a hash? */
        if (!hash_expected && !hash_location)
                return 0;

        if (hash_expected && !hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        if (!hash_expected && hash_location) {
                LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest));
                return 1;
        }

        /* Okay, so this is hash_expected and hash_location -
         * so we need to calculate the checksum.
         */
        genhash = tcp_v4_do_calc_md5_hash(newhash,
                                          hash_expected,
                                          iph->saddr, iph->daddr,
                                          th, sk->sk_protocol,
                                          skb->len);

        if (genhash || memcmp(hash_location, newhash, 16) != 0) {
                if (net_ratelimit()) {
                        printk(KERN_INFO "MD5 Hash failed for "
                               "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
                               NIPQUAD(iph->saddr), ntohs(th->source),
                               NIPQUAD(iph->daddr), ntohs(th->dest),
                               genhash ? " tcp_v4_calc_md5_hash failed" : "");
                }
                return 1;
        }
        return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
        .rtx_syn_ack    =       tcp_v4_send_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
};

#ifdef CONFIG_TCP_MD5SIG
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
};
#endif

static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size   = sizeof(struct tcp_timewait_sock),
        .twsk_unique     = tcp_twsk_unique,
        .twsk_destructor = tcp_twsk_destructor,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast addresses */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and the peer is
         * evidently a real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* The accept backlog is full. If we have already queued enough
         * warm entries in the syn queue, drop the request. That is better
         * than clogging the syn queue with openreqs with exponentially
         * increasing timeouts.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

#ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on a web server,
                 * which contains information interesting only for Windows
                 * users) do not send their stamp in the SYN. It is the easy
                 * case: we simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save the last timestamp seen
                 * from the destination in the peer table, when entering
                 * TIME-WAIT state, and check against it before
                 * accepting a new connection request.
                 *
                 * If "isn" is not zero, this request hit an alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies the last quarter of the
                         * backlog is reserved for destinations proven
                         * to be alive. It means that we continue to
                         * communicate with destinations already
                         * remembered at the moment of the synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(tcp_hdr(skb)->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        return 0;
}


/*
 * The three-way handshake has completed - we got a valid ACK for our
 * SYN-ACK - so now create the new socket.
 */
1410 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1411                                   struct request_sock *req,
1412                                   struct dst_entry *dst)
1413 {
1414         struct inet_request_sock *ireq;
1415         struct inet_sock *newinet;
1416         struct tcp_sock *newtp;
1417         struct sock *newsk;
1418 #ifdef CONFIG_TCP_MD5SIG
1419         struct tcp_md5sig_key *key;
1420 #endif
1421
1422         if (sk_acceptq_is_full(sk))
1423                 goto exit_overflow;
1424
1425         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1426                 goto exit;
1427
1428         newsk = tcp_create_openreq_child(sk, req, skb);
1429         if (!newsk)
1430                 goto exit;
1431
1432         newsk->sk_gso_type = SKB_GSO_TCPV4;
1433         sk_setup_caps(newsk, dst);
1434
1435         newtp                 = tcp_sk(newsk);
1436         newinet               = inet_sk(newsk);
1437         ireq                  = inet_rsk(req);
1438         newinet->daddr        = ireq->rmt_addr;
1439         newinet->rcv_saddr    = ireq->loc_addr;
1440         newinet->saddr        = ireq->loc_addr;
1441         newinet->opt          = ireq->opt;
1442         ireq->opt             = NULL;
1443         newinet->mc_index     = inet_iif(skb);
1444         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1445         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1446         if (newinet->opt)
1447                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1448         newinet->id = newtp->write_seq ^ jiffies;
1449
1450         tcp_mtup_init(newsk);
1451         tcp_sync_mss(newsk, dst_mtu(dst));
1452         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1453         tcp_initialize_rcv_mss(newsk);
1454
1455 #ifdef CONFIG_TCP_MD5SIG
1456         /* Copy over the MD5 key from the original socket */
1457         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1458                 /*
1459                  * We're using one, so create a matching key
1460                  * on the newsk structure. If we fail to get
1461                  * memory, then we end up not copying the key
1462                  * across. Shucks.
1463                  */
1464                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1465                 if (newkey != NULL)
1466                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1467                                           newkey, key->keylen);
1468         }
1469 #endif
1470
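        /* Hash the child into the established table and inherit the
         * parent's bound local port.
         */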
1471         __inet_hash(&tcp_hashinfo, newsk, 0);
1472         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1473
1474         return newsk;
1475
1476 exit_overflow:
1477         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1478 exit:
1479         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1480         dst_release(dst);
1481         return NULL;
1482 }
1483
1484 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcphdr *th = tcp_hdr(skb);
1487         const struct iphdr *iph = ip_hdr(skb);
1488         struct sock *nsk;
1489         struct request_sock **prev;
1490         /* Find possible connection requests. */
1491         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1492                                                        iph->saddr, iph->daddr);
1493         if (req)
1494                 return tcp_check_req(sk, skb, req, prev);
1495
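        /* No pending open request: the handshake may already have
         * completed, so also check the established (and TIME_WAIT)
         * hash for this segment's 4-tuple.
         */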
1496         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1497                                       iph->daddr, th->dest, inet_iif(skb));
1498
1499         if (nsk) {
1500                 if (nsk->sk_state != TCP_TIME_WAIT) {
1501                         bh_lock_sock(nsk);
1502                         return nsk;
1503                 }
1504                 inet_twsk_put(inet_twsk(nsk));
1505                 return NULL;
1506         }
1507
1508 #ifdef CONFIG_SYN_COOKIES
1509         if (!th->rst && !th->syn && th->ack)
1510                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1511 #endif
1512         return sk;
1513 }
1514
1515 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1516 {
1517         const struct iphdr *iph = ip_hdr(skb);
1518
1519         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1520                 if (!tcp_v4_check(skb->len, iph->saddr,
1521                                   iph->daddr, skb->csum)) {
1522                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1523                         return 0;
1524                 }
1525         }
1526
1527         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1528                                        skb->len, IPPROTO_TCP, 0);
1529
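        /* For short packets it is cheap to verify the checksum right
         * away; for longer ones verification is deferred and typically
         * folded into the copy to user space.
         */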
1530         if (skb->len <= 76)
1531                 return __skb_checksum_complete(skb);
1533         return 0;
1534 }
1535
1536
1537 /* The socket must have its spinlock held when we get
1538  * here.
1539  *
1540  * We have a potential double-lock case here, so even when
1541  * doing backlog processing we use the BH locking scheme.
1542  * This is because we cannot sleep with the original spinlock
1543  * held.
1544  */
1545 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1546 {
1547         struct sock *rsk;
1548 #ifdef CONFIG_TCP_MD5SIG
1549         /*
1550          * We really want to reject the packet as early as possible
1551          * if:
1552          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1553          *  o There is an MD5 option and we're not expecting one
1554          */
1555         if (tcp_v4_inbound_md5_hash(sk, skb))
1556                 goto discard;
1557 #endif
1558
1559         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1560                 TCP_CHECK_TIMER(sk);
1561                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1562                         rsk = sk;
1563                         goto reset;
1564                 }
1565                 TCP_CHECK_TIMER(sk);
1566                 return 0;
1567         }
1568
1569         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1570                 goto csum_err;
1571
1572         if (sk->sk_state == TCP_LISTEN) {
1573                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1574                 if (!nsk)
1575                         goto discard;
1576
1577                 if (nsk != sk) {
1578                         if (tcp_child_process(sk, nsk, skb)) {
1579                                 rsk = nsk;
1580                                 goto reset;
1581                         }
1582                         return 0;
1583                 }
1584         }
1585
1586         TCP_CHECK_TIMER(sk);
1587         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1588                 rsk = sk;
1589                 goto reset;
1590         }
1591         TCP_CHECK_TIMER(sk);
1592         return 0;
1593
1594 reset:
1595         tcp_v4_send_reset(rsk, skb);
1596 discard:
1597         kfree_skb(skb);
1598         /* Be careful here. If this function gets more complicated and
1599          * gcc suffers from register pressure on the x86, sk (in %ebx)
1600          * might be destroyed here. This current version compiles correctly,
1601          * but you have been warned.
1602          */
1603         return 0;
1604
1605 csum_err:
1606         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1607         goto discard;
1608 }
1609
1610 /*
1611  *      From tcp_input.c
1612  */
1613
1614 int tcp_v4_rcv(struct sk_buff *skb)
1615 {
1616         const struct iphdr *iph;
1617         struct tcphdr *th;
1618         struct sock *sk;
1619         int ret;
1620
1621         if (skb->pkt_type != PACKET_HOST)
1622                 goto discard_it;
1623
1624         /* Count it even if it's bad */
1625         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1626
1627         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1628                 goto discard_it;
1629
1630         th = tcp_hdr(skb);
1631
1632         if (th->doff < sizeof(struct tcphdr) / 4)
1633                 goto bad_packet;
1634         if (!pskb_may_pull(skb, th->doff * 4))
1635                 goto discard_it;
1636
1637         /* An explanation is required here, I think.
1638          * Packet length and doff are validated by header prediction,
1639          * provided the case of th->doff == 0 is eliminated above.
1640          * So, we defer the checks. */
1641         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1642                 goto bad_packet;
1643
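        /* Fill in the TCP control block.  SYN and FIN each consume one
         * sequence number, hence their inclusion in end_seq.
         */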
1644         th = tcp_hdr(skb);
1645         iph = ip_hdr(skb);
1646         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1647         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1648                                     skb->len - th->doff * 4);
1649         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1650         TCP_SKB_CB(skb)->when    = 0;
1651         TCP_SKB_CB(skb)->flags   = iph->tos;
1652         TCP_SKB_CB(skb)->sacked  = 0;
1653
1654         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1655                            iph->daddr, th->dest, inet_iif(skb));
1656         if (!sk)
1657                 goto no_tcp_socket;
1658
1659 process:
1660         if (sk->sk_state == TCP_TIME_WAIT)
1661                 goto do_time_wait;
1662
1663         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1664                 goto discard_and_relse;
1665         nf_reset(skb);
1666
1667         if (sk_filter(sk, skb))
1668                 goto discard_and_relse;
1669
1670         skb->dev = NULL;
1671
1672         bh_lock_sock_nested(sk);
1673         ret = 0;
1674         if (!sock_owned_by_user(sk)) {
1675 #ifdef CONFIG_NET_DMA
1676                 struct tcp_sock *tp = tcp_sk(sk);
1677                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1678                         tp->ucopy.dma_chan = get_softnet_dma();
1679                 if (tp->ucopy.dma_chan)
1680                         ret = tcp_v4_do_rcv(sk, skb);
1681                 else
1682 #endif
1683                 {
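                        /* tcp_prequeue() returns nonzero when it has queued
                         * the segment for the process-context reader; only
                         * process it here if it was not queued.
                         */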
1684                         if (!tcp_prequeue(sk, skb))
1685                                 ret = tcp_v4_do_rcv(sk, skb);
1686                 }
1687         } else
1688                 sk_add_backlog(sk, skb);
1689         bh_unlock_sock(sk);
1690
1691         sock_put(sk);
1692
1693         return ret;
1694
1695 no_tcp_socket:
1696         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1697                 goto discard_it;
1698
1699         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1700 bad_packet:
1701                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1702         } else {
1703                 tcp_v4_send_reset(NULL, skb);
1704         }
1705
1706 discard_it:
1707         /* Discard frame. */
1708         kfree_skb(skb);
1709         return 0;
1710
1711 discard_and_relse:
1712         sock_put(sk);
1713         goto discard_it;
1714
1715 do_time_wait:
1716         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1717                 inet_twsk_put(inet_twsk(sk));
1718                 goto discard_it;
1719         }
1720
1721         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1722                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1723                 inet_twsk_put(inet_twsk(sk));
1724                 goto discard_it;
1725         }
1726         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
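        /* A new SYN can legitimately reopen a connection in TIME_WAIT
         * (tcp_timewait_state_process() decided it is safe), so try to
         * hand it to a current listener.
         */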
1727         case TCP_TW_SYN: {
1728                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1729                                                         iph->daddr, th->dest,
1730                                                         inet_iif(skb));
1731                 if (sk2) {
1732                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1733                         inet_twsk_put(inet_twsk(sk));
1734                         sk = sk2;
1735                         goto process;
1736                 }
1737                 /* Fall through to ACK */
1738         }
1739         case TCP_TW_ACK:
1740                 tcp_v4_timewait_ack(sk, skb);
1741                 break;
1742         case TCP_TW_RST:
1743                 goto no_tcp_socket;
1744         case TCP_TW_SUCCESS:;
1745         }
1746         goto discard_it;
1747 }
1748
1749 /* VJ's idea. Save the last timestamp seen from this destination and hold
1750  * it for at least the normal timewait interval, so that it can be used for
1751  * duplicate segment detection in subsequent connections before they enter
1752  * the synchronized state.
1753  */
1754
1755 int tcp_v4_remember_stamp(struct sock *sk)
1756 {
1757         struct inet_sock *inet = inet_sk(sk);
1758         struct tcp_sock *tp = tcp_sk(sk);
1759         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1760         struct inet_peer *peer = NULL;
1761         int release_it = 0;
1762
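        /* Prefer the peer cached on the route when it matches the
         * destination; otherwise look the peer up directly and remember
         * to drop that extra reference below.
         */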
1763         if (!rt || rt->rt_dst != inet->daddr) {
1764                 peer = inet_getpeer(inet->daddr, 1);
1765                 release_it = 1;
1766         } else {
1767                 if (!rt->peer)
1768                         rt_bind_peer(rt, 1);
1769                 peer = rt->peer;
1770         }
1771
1772         if (peer) {
1773                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1774                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1775                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1776                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1777                         peer->tcp_ts = tp->rx_opt.ts_recent;
1778                 }
1779                 if (release_it)
1780                         inet_putpeer(peer);
1781                 return 1;
1782         }
1783
1784         return 0;
1785 }
1786
1787 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1788 {
1789         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1790
1791         if (peer) {
1792                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1793
1794                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1795                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1796                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1797                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1798                         peer->tcp_ts       = tcptw->tw_ts_recent;
1799                 }
1800                 inet_putpeer(peer);
1801                 return 1;
1802         }
1803
1804         return 0;
1805 }
1806
1807 struct inet_connection_sock_af_ops ipv4_specific = {
1808         .queue_xmit        = ip_queue_xmit,
1809         .send_check        = tcp_v4_send_check,
1810         .rebuild_header    = inet_sk_rebuild_header,
1811         .conn_request      = tcp_v4_conn_request,
1812         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1813         .remember_stamp    = tcp_v4_remember_stamp,
1814         .net_header_len    = sizeof(struct iphdr),
1815         .setsockopt        = ip_setsockopt,
1816         .getsockopt        = ip_getsockopt,
1817         .addr2sockaddr     = inet_csk_addr2sockaddr,
1818         .sockaddr_len      = sizeof(struct sockaddr_in),
1819 #ifdef CONFIG_COMPAT
1820         .compat_setsockopt = compat_ip_setsockopt,
1821         .compat_getsockopt = compat_ip_getsockopt,
1822 #endif
1823 };
1824
1825 #ifdef CONFIG_TCP_MD5SIG
1826 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1827         .md5_lookup             = tcp_v4_md5_lookup,
1828         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1829         .md5_add                = tcp_v4_md5_add_func,
1830         .md5_parse              = tcp_v4_parse_md5_keys,
1831 };
1832 #endif
1833
1834 /* NOTE: A lot of things are set to zero explicitly by the call to
1835  *       sk_alloc(), so they need not be done here.
1836  */
1837 static int tcp_v4_init_sock(struct sock *sk)
1838 {
1839         struct inet_connection_sock *icsk = inet_csk(sk);
1840         struct tcp_sock *tp = tcp_sk(sk);
1841
1842         skb_queue_head_init(&tp->out_of_order_queue);
1843         tcp_init_xmit_timers(sk);
1844         tcp_prequeue_init(tp);
1845
1846         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1847         tp->mdev = TCP_TIMEOUT_INIT;
1848
1849         /* So many TCP implementations out there (incorrectly) count the
1850          * initial SYN frame in their delayed-ACK and congestion control
1851          * algorithms that we must have the following bandaid to talk
1852          * efficiently to them.  -DaveM
1853          */
1854         tp->snd_cwnd = 2;
1855
1856         /* See draft-stevens-tcpca-spec-01 for discussion of the
1857          * initialization of these values.
1858          */
1859         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1860         tp->snd_cwnd_clamp = ~0;
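        /* 536 is the RFC 1122 default MSS (a 576 byte datagram minus
         * 40 bytes of IP + TCP header).
         */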
1861         tp->mss_cache = 536;
1862
1863         tp->reordering = sysctl_tcp_reordering;
1864         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1865
1866         sk->sk_state = TCP_CLOSE;
1867
1868         sk->sk_write_space = sk_stream_write_space;
1869         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1870
1871         icsk->icsk_af_ops = &ipv4_specific;
1872         icsk->icsk_sync_mss = tcp_sync_mss;
1873 #ifdef CONFIG_TCP_MD5SIG
1874         tp->af_specific = &tcp_sock_ipv4_specific;
1875 #endif
1876
1877         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1878         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1879
1880         atomic_inc(&tcp_sockets_allocated);
1881
1882         return 0;
1883 }
1884
1885 int tcp_v4_destroy_sock(struct sock *sk)
1886 {
1887         struct tcp_sock *tp = tcp_sk(sk);
1888
1889         tcp_clear_xmit_timers(sk);
1890
1891         tcp_cleanup_congestion_control(sk);
1892
1893         /* Clean up the write buffer. */
1894         tcp_write_queue_purge(sk);
1895
1896         /* Cleans up our, hopefully empty, out_of_order_queue. */
1897         __skb_queue_purge(&tp->out_of_order_queue);
1898
1899 #ifdef CONFIG_TCP_MD5SIG
1900         /* Clean up the MD5 key list, if any */
1901         if (tp->md5sig_info) {
1902                 tcp_v4_clear_md5_list(sk);
1903                 kfree(tp->md5sig_info);
1904                 tp->md5sig_info = NULL;
1905         }
1906 #endif
1907
1908 #ifdef CONFIG_NET_DMA
1909         /* Cleans up our sk_async_wait_queue */
1910         __skb_queue_purge(&sk->sk_async_wait_queue);
1911 #endif
1912
1913         /* Clean up the prequeue; it really should be empty. */
1914         __skb_queue_purge(&tp->ucopy.prequeue);
1915
1916         /* Clean up a referenced TCP bind bucket. */
1917         if (inet_csk(sk)->icsk_bind_hash)
1918                 inet_put_port(&tcp_hashinfo, sk);
1919
1920         /*
1921          * If a sendmsg cached page exists, toss it.
1922          */
1923         if (sk->sk_sndmsg_page) {
1924                 __free_page(sk->sk_sndmsg_page);
1925                 sk->sk_sndmsg_page = NULL;
1926         }
1927
1928         atomic_dec(&tcp_sockets_allocated);
1929
1930         return 0;
1931 }
1932
1933 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1934
1935 #ifdef CONFIG_PROC_FS
1936 /* Proc filesystem TCP sock list dumping. */
1937
1938 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1939 {
1940         return hlist_empty(head) ? NULL :
1941                 hlist_entry(head->first, struct inet_timewait_sock, tw_node);
1942 }
1943
1944 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1945 {
1946         return tw->tw_node.next ?
1947                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1948 }
1949
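/* Walk the listening hash.  For each listener we also descend into its
 * SYN table (state TCP_SEQ_STATE_OPENREQ) so that embryonic connections
 * are reported too.
 */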
1950 static void *listening_get_next(struct seq_file *seq, void *cur)
1951 {
1952         struct inet_connection_sock *icsk;
1953         struct hlist_node *node;
1954         struct sock *sk = cur;
1955         struct tcp_iter_state *st = seq->private;
1956
1957         if (!sk) {
1958                 st->bucket = 0;
1959                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1960                 goto get_sk;
1961         }
1962
1963         ++st->num;
1964
1965         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1966                 struct request_sock *req = cur;
1967
1968                 icsk = inet_csk(st->syn_wait_sk);
1969                 req = req->dl_next;
1970                 while (1) {
1971                         while (req) {
1972                                 if (req->rsk_ops->family == st->family) {
1973                                         cur = req;
1974                                         goto out;
1975                                 }
1976                                 req = req->dl_next;
1977                         }
1978                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1979                                 break;
1980 get_req:
1981                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1982                 }
1983                 sk        = sk_next(st->syn_wait_sk);
1984                 st->state = TCP_SEQ_STATE_LISTENING;
1985                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1986         } else {
1987                 icsk = inet_csk(sk);
1988                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1989                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1990                         goto start_req;
1991                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992                 sk = sk_next(sk);
1993         }
1994 get_sk:
1995         sk_for_each_from(sk, node) {
1996                 if (sk->sk_family == st->family) {
1997                         cur = sk;
1998                         goto out;
1999                 }
2000                 icsk = inet_csk(sk);
2001                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2002                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2003 start_req:
2004                         st->uid         = sock_i_uid(sk);
2005                         st->syn_wait_sk = sk;
2006                         st->state       = TCP_SEQ_STATE_OPENREQ;
2007                         st->sbucket     = 0;
2008                         goto get_req;
2009                 }
2010                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011         }
2012         if (++st->bucket < INET_LHTABLE_SIZE) {
2013                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2014                 goto get_sk;
2015         }
2016         cur = NULL;
2017 out:
2018         return cur;
2019 }
2020
2021 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2022 {
2023         void *rc = listening_get_next(seq, NULL);
2024
2025         while (rc && *pos) {
2026                 rc = listening_get_next(seq, rc);
2027                 --*pos;
2028         }
2029         return rc;
2030 }
2031
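/* Walk the established hash: for each bucket, regular sockets first,
 * then the TIME_WAIT chain of the same bucket.
 */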
2032 static void *established_get_first(struct seq_file *seq)
2033 {
2034         struct tcp_iter_state *st = seq->private;
2035         void *rc = NULL;
2036
2037         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2038                 struct sock *sk;
2039                 struct hlist_node *node;
2040                 struct inet_timewait_sock *tw;
2041
2042                 /* We can reschedule _before_ having picked the target: */
2043                 cond_resched_softirq();
2044
2045                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2046                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2047                         if (sk->sk_family != st->family)
2048                                 continue;
2050                         rc = sk;
2051                         goto out;
2052                 }
2053                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2054                 inet_twsk_for_each(tw, node,
2055                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2056                         if (tw->tw_family != st->family)
2057                                 continue;
2059                         rc = tw;
2060                         goto out;
2061                 }
2062                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2063                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2064         }
2065 out:
2066         return rc;
2067 }
2068
2069 static void *established_get_next(struct seq_file *seq, void *cur)
2070 {
2071         struct sock *sk = cur;
2072         struct inet_timewait_sock *tw;
2073         struct hlist_node *node;
2074         struct tcp_iter_state *st = seq->private;
2075
2076         ++st->num;
2077
2078         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2079                 tw = cur;
2080                 tw = tw_next(tw);
2081 get_tw:
2082                 while (tw && tw->tw_family != st->family)
2083                         tw = tw_next(tw);
2085                 if (tw) {
2086                         cur = tw;
2087                         goto out;
2088                 }
2089                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2090                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2091
2092                 /* We can reschedule between buckets: */
2093                 cond_resched_softirq();
2094
2095                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2096                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2097                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2098                 } else {
2099                         cur = NULL;
2100                         goto out;
2101                 }
2102         } else
2103                 sk = sk_next(sk);
2104
2105         sk_for_each_from(sk, node) {
2106                 if (sk->sk_family == st->family)
2107                         goto found;
2108         }
2109
2110         st->state = TCP_SEQ_STATE_TIME_WAIT;
2111         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2112         goto get_tw;
2113 found:
2114         cur = sk;
2115 out:
2116         return cur;
2117 }
2118
2119 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2120 {
2121         void *rc = established_get_first(seq);
2122
2123         while (rc && pos) {
2124                 rc = established_get_next(seq, rc);
2125                 --pos;
2126         }
2127         return rc;
2128 }
2129
2130 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2131 {
2132         void *rc;
2133         struct tcp_iter_state *st = seq->private;
2134
2135         inet_listen_lock(&tcp_hashinfo);
2136         st->state = TCP_SEQ_STATE_LISTENING;
2137         rc        = listening_get_idx(seq, &pos);
2138
2139         if (!rc) {
2140                 inet_listen_unlock(&tcp_hashinfo);
2141                 local_bh_disable();
2142                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2143                 rc        = established_get_idx(seq, pos);
2144         }
2145
2146         return rc;
2147 }
2148
2149 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2150 {
2151         struct tcp_iter_state *st = seq->private;
2152         st->state = TCP_SEQ_STATE_LISTENING;
2153         st->num = 0;
2154         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2155 }
2156
2157 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2158 {
2159         void *rc = NULL;
2160         struct tcp_iter_state *st;
2161
2162         if (v == SEQ_START_TOKEN) {
2163                 rc = tcp_get_idx(seq, 0);
2164                 goto out;
2165         }
2166         st = seq->private;
2167
2168         switch (st->state) {
2169         case TCP_SEQ_STATE_OPENREQ:
2170         case TCP_SEQ_STATE_LISTENING:
2171                 rc = listening_get_next(seq, v);
2172                 if (!rc) {
2173                         inet_listen_unlock(&tcp_hashinfo);
2174                         local_bh_disable();
2175                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2176                         rc        = established_get_first(seq);
2177                 }
2178                 break;
2179         case TCP_SEQ_STATE_ESTABLISHED:
2180         case TCP_SEQ_STATE_TIME_WAIT:
2181                 rc = established_get_next(seq, v);
2182                 break;
2183         }
2184 out:
2185         ++*pos;
2186         return rc;
2187 }
2188
2189 static void tcp_seq_stop(struct seq_file *seq, void *v)
2190 {
2191         struct tcp_iter_state *st = seq->private;
2192
2193         switch (st->state) {
2194         case TCP_SEQ_STATE_OPENREQ:
2195                 if (v) {
2196                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2197                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2198                 }
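                /* fall through - the listening lock is dropped below */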
2199         case TCP_SEQ_STATE_LISTENING:
2200                 if (v != SEQ_START_TOKEN)
2201                         inet_listen_unlock(&tcp_hashinfo);
2202                 break;
2203         case TCP_SEQ_STATE_TIME_WAIT:
2204         case TCP_SEQ_STATE_ESTABLISHED:
2205                 if (v)
2206                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2207                 local_bh_enable();
2208                 break;
2209         }
2210 }
2211
2212 static int tcp_seq_open(struct inode *inode, struct file *file)
2213 {
2214         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2215         struct seq_file *seq;
2216         struct tcp_iter_state *s;
2217         int rc;
2218
2219         if (unlikely(afinfo == NULL))
2220                 return -EINVAL;
2221
2222         s = kzalloc(sizeof(*s), GFP_KERNEL);
2223         if (!s)
2224                 return -ENOMEM;
2225         s->family               = afinfo->family;
2226         s->seq_ops.start        = tcp_seq_start;
2227         s->seq_ops.next         = tcp_seq_next;
2228         s->seq_ops.show         = afinfo->seq_show;
2229         s->seq_ops.stop         = tcp_seq_stop;
2230
2231         rc = seq_open(file, &s->seq_ops);
2232         if (rc)
2233                 goto out_kfree;
2234         seq          = file->private_data;
2235         seq->private = s;
2236 out:
2237         return rc;
2238 out_kfree:
2239         kfree(s);
2240         goto out;
2241 }
2242
2243 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2244 {
2245         int rc = 0;
2246         struct proc_dir_entry *p;
2247
2248         if (!afinfo)
2249                 return -EINVAL;
2250         afinfo->seq_fops->owner         = afinfo->owner;
2251         afinfo->seq_fops->open          = tcp_seq_open;
2252         afinfo->seq_fops->read          = seq_read;
2253         afinfo->seq_fops->llseek        = seq_lseek;
2254         afinfo->seq_fops->release       = seq_release_private;
2255
2256         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2257         if (p)
2258                 p->data = afinfo;
2259         else
2260                 rc = -ENOMEM;
2261         return rc;
2262 }
2263
2264 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2265 {
2266         if (!afinfo)
2267                 return;
2268         proc_net_remove(afinfo->name);
2269         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2270 }
2271
2272 static void get_openreq4(struct sock *sk, struct request_sock *req,
2273                          char *tmpbuf, int i, int uid)
2274 {
2275         const struct inet_request_sock *ireq = inet_rsk(req);
2276         int ttd = req->expires - jiffies;
2277
2278         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2279                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2280                 i,
2281                 ireq->loc_addr,
2282                 ntohs(inet_sk(sk)->sport),
2283                 ireq->rmt_addr,
2284                 ntohs(ireq->rmt_port),
2285                 TCP_SYN_RECV,
2286                 0, 0, /* could print option size, but that is af dependent. */
2287                 1,    /* timers active (only the expire timer) */
2288                 jiffies_to_clock_t(ttd),
2289                 req->retrans,
2290                 uid,
2291                 0,  /* non standard timer */
2292                 0, /* open_requests have no inode */
2293                 atomic_read(&sk->sk_refcnt),
2294                 req);
2295 }
2296
2297 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2298 {
2299         int timer_active;
2300         unsigned long timer_expires;
2301         struct tcp_sock *tp = tcp_sk(sk);
2302         const struct inet_connection_sock *icsk = inet_csk(sk);
2303         struct inet_sock *inet = inet_sk(sk);
2304         __be32 dest = inet->daddr;
2305         __be32 src = inet->rcv_saddr;
2306         __u16 destp = ntohs(inet->dport);
2307         __u16 srcp = ntohs(inet->sport);
2308
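        /* Timer codes for the "tr" column of /proc/net/tcp: 0 none
         * pending, 1 retransmit, 2 keepalive (sk_timer), 3 TIME_WAIT
         * (printed by get_timewait4_sock()), 4 zero window probe.
         */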
2309         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2310                 timer_active    = 1;
2311                 timer_expires   = icsk->icsk_timeout;
2312         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2313                 timer_active    = 4;
2314                 timer_expires   = icsk->icsk_timeout;
2315         } else if (timer_pending(&sk->sk_timer)) {
2316                 timer_active    = 2;
2317                 timer_expires   = sk->sk_timer.expires;
2318         } else {
2319                 timer_active    = 0;
2320                 timer_expires = jiffies;
2321         }
2322
2323         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2324                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2325                 i, src, srcp, dest, destp, sk->sk_state,
2326                 tp->write_seq - tp->snd_una,
2327                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2328                                              (tp->rcv_nxt - tp->copied_seq),
2329                 timer_active,
2330                 jiffies_to_clock_t(timer_expires - jiffies),
2331                 icsk->icsk_retransmits,
2332                 sock_i_uid(sk),
2333                 icsk->icsk_probes_out,
2334                 sock_i_ino(sk),
2335                 atomic_read(&sk->sk_refcnt), sk,
2336                 icsk->icsk_rto,
2337                 icsk->icsk_ack.ato,
2338                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2339                 tp->snd_cwnd,
2340                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2341 }
2342
2343 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2344                                char *tmpbuf, int i)
2345 {
2346         __be32 dest, src;
2347         __u16 destp, srcp;
2348         int ttd = tw->tw_ttd - jiffies;
2349
2350         if (ttd < 0)
2351                 ttd = 0;
2352
2353         dest  = tw->tw_daddr;
2354         src   = tw->tw_rcv_saddr;
2355         destp = ntohs(tw->tw_dport);
2356         srcp  = ntohs(tw->tw_sport);
2357
2358         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2359                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2360                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2361                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2362                 atomic_read(&tw->tw_refcnt), tw);
2363 }
2364
2365 #define TMPSZ 150
2366
2367 static int tcp4_seq_show(struct seq_file *seq, void *v)
2368 {
2369         struct tcp_iter_state *st;
2370         char tmpbuf[TMPSZ + 1];
2371
2372         if (v == SEQ_START_TOKEN) {
2373                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2374                            "  sl  local_address rem_address   st tx_queue "
2375                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2376                            "inode");
2377                 goto out;
2378         }
2379         st = seq->private;
2380
2381         switch (st->state) {
2382         case TCP_SEQ_STATE_LISTENING:
2383         case TCP_SEQ_STATE_ESTABLISHED:
2384                 get_tcp4_sock(v, tmpbuf, st->num);
2385                 break;
2386         case TCP_SEQ_STATE_OPENREQ:
2387                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2388                 break;
2389         case TCP_SEQ_STATE_TIME_WAIT:
2390                 get_timewait4_sock(v, tmpbuf, st->num);
2391                 break;
2392         }
2393         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2394 out:
2395         return 0;
2396 }
2397
2398 static struct file_operations tcp4_seq_fops;
2399 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2400         .owner          = THIS_MODULE,
2401         .name           = "tcp",
2402         .family         = AF_INET,
2403         .seq_show       = tcp4_seq_show,
2404         .seq_fops       = &tcp4_seq_fops,
2405 };
2406
2407 int __init tcp4_proc_init(void)
2408 {
2409         return tcp_proc_register(&tcp4_seq_afinfo);
2410 }
2411
2412 void tcp4_proc_exit(void)
2413 {
2414         tcp_proc_unregister(&tcp4_seq_afinfo);
2415 }
2416 #endif /* CONFIG_PROC_FS */
2417
2418 struct proto tcp_prot = {
2419         .name                   = "TCP",
2420         .owner                  = THIS_MODULE,
2421         .close                  = tcp_close,
2422         .connect                = tcp_v4_connect,
2423         .disconnect             = tcp_disconnect,
2424         .accept                 = inet_csk_accept,
2425         .ioctl                  = tcp_ioctl,
2426         .init                   = tcp_v4_init_sock,
2427         .destroy                = tcp_v4_destroy_sock,
2428         .shutdown               = tcp_shutdown,
2429         .setsockopt             = tcp_setsockopt,
2430         .getsockopt             = tcp_getsockopt,
2431         .sendmsg                = tcp_sendmsg,
2432         .recvmsg                = tcp_recvmsg,
2433         .backlog_rcv            = tcp_v4_do_rcv,
2434         .hash                   = tcp_v4_hash,
2435         .unhash                 = tcp_unhash,
2436         .get_port               = tcp_v4_get_port,
2437         .enter_memory_pressure  = tcp_enter_memory_pressure,
2438         .sockets_allocated      = &tcp_sockets_allocated,
2439         .orphan_count           = &tcp_orphan_count,
2440         .memory_allocated       = &tcp_memory_allocated,
2441         .memory_pressure        = &tcp_memory_pressure,
2442         .sysctl_mem             = sysctl_tcp_mem,
2443         .sysctl_wmem            = sysctl_tcp_wmem,
2444         .sysctl_rmem            = sysctl_tcp_rmem,
2445         .max_header             = MAX_TCP_HEADER,
2446         .obj_size               = sizeof(struct tcp_sock),
2447         .twsk_prot              = &tcp_timewait_sock_ops,
2448         .rsk_prot               = &tcp_request_sock_ops,
2449 #ifdef CONFIG_COMPAT
2450         .compat_setsockopt      = compat_tcp_setsockopt,
2451         .compat_getsockopt      = compat_tcp_getsockopt,
2452 #endif
2453 };
2454
2455 void __init tcp_v4_init(struct net_proto_family *ops)
2456 {
2457         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2458                                      IPPROTO_TCP) < 0)
2459                 panic("Failed to create the TCP control socket.\n");
2460 }
2461
2462 EXPORT_SYMBOL(ipv4_specific);
2463 EXPORT_SYMBOL(tcp_hashinfo);
2464 EXPORT_SYMBOL(tcp_prot);
2465 EXPORT_SYMBOL(tcp_unhash);
2466 EXPORT_SYMBOL(tcp_v4_conn_request);
2467 EXPORT_SYMBOL(tcp_v4_connect);
2468 EXPORT_SYMBOL(tcp_v4_do_rcv);
2469 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2470 EXPORT_SYMBOL(tcp_v4_send_check);
2471 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2472
2473 #ifdef CONFIG_PROC_FS
2474 EXPORT_SYMBOL(tcp_proc_register);
2475 EXPORT_SYMBOL(tcp_proc_unregister);
2476 #endif
2477 EXPORT_SYMBOL(sysctl_local_port_range);
2478 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2479