gre: Fix MTU calculation for bound GRE tunnels
[pandora-kernel.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
/*
 * Tunables defined by this file.
 *
 * sysctl_tcp_tw_reuse is read by tcp_twsk_unique() when deciding whether a
 * TIME-WAIT socket may be taken over by a new outgoing connection.
 * sysctl_tcp_low_latency is not read anywhere in the part of the file shown
 * here; NOTE(review): its consumers live elsewhere - confirm before changing.
 */
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87
/*
 * tcp_v4_md5_do_lookup() and tcp_v4_md5_hash_hdr() are called further down
 * before their definitions appear, so they are declared up front when
 * CONFIG_TCP_MD5SIG is enabled.  Without that option callers get an inline
 * tcp_v4_md5_do_lookup() that always returns NULL (no key configured).
 */
88 #ifdef CONFIG_TCP_MD5SIG
89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90                                                    __be32 addr);
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
/* Lookup state for TCP sockets; tcp_v4_err() hands it to inet_lookup(). */
101 struct inet_hashinfo tcp_hashinfo;
102
103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104 {
105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106                                           ip_hdr(skb)->saddr,
107                                           tcp_hdr(skb)->dest,
108                                           tcp_hdr(skb)->source);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's one, only timestamp cache is
121            held not per host, but per port pair and TW bucket is used as state
122            holder.
123
124            If TW bucket has been already destroyed we fall back to VJ's scheme
125            and use initial timestamp retrieved from peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (twp == NULL || (sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141
142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143
144 /* This will initiate an outgoing connection. */
145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 {
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150         struct rtable *rt;
151         __be32 daddr, nexthop;
152         int tmp;
153         int err;
154
155         if (addr_len < sizeof(struct sockaddr_in))
156                 return -EINVAL;
157
158         if (usin->sin_family != AF_INET)
159                 return -EAFNOSUPPORT;
160
161         nexthop = daddr = usin->sin_addr.s_addr;
162         if (inet->opt && inet->opt->srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet->opt->faddr;
166         }
167
168         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170                                IPPROTO_TCP,
171                                inet->sport, usin->sin_port, sk, 1);
172         if (tmp < 0) {
173                 if (tmp == -ENETUNREACH)
174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175                 return tmp;
176         }
177
178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179                 ip_rt_put(rt);
180                 return -ENETUNREACH;
181         }
182
183         if (!inet->opt || !inet->opt->srr)
184                 daddr = rt->rt_dst;
185
186         if (!inet->saddr)
187                 inet->saddr = rt->rt_src;
188         inet->rcv_saddr = inet->saddr;
189
190         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
191                 /* Reset inherited state */
192                 tp->rx_opt.ts_recent       = 0;
193                 tp->rx_opt.ts_recent_stamp = 0;
194                 tp->write_seq              = 0;
195         }
196
197         if (tcp_death_row.sysctl_tw_recycle &&
198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199                 struct inet_peer *peer = rt_get_peer(rt);
200                 /*
201                  * VJ's idea. We save last timestamp seen from
202                  * the destination in peer table, when entering state
203                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
204                  * when trying new connection.
205                  */
206                 if (peer != NULL &&
207                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209                         tp->rx_opt.ts_recent = peer->tcp_ts;
210                 }
211         }
212
213         inet->dport = usin->sin_port;
214         inet->daddr = daddr;
215
216         inet_csk(sk)->icsk_ext_hdr_len = 0;
217         if (inet->opt)
218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220         tp->rx_opt.mss_clamp = 536;
221
222         /* Socket identity is still unknown (sport may be zero).
223          * However we set state to SYN-SENT and not releasing socket
224          * lock select source port, enter ourselves into the hash tables and
225          * complete initialization after this.
226          */
227         tcp_set_state(sk, TCP_SYN_SENT);
228         err = inet_hash_connect(&tcp_death_row, sk);
229         if (err)
230                 goto failure;
231
232         err = ip_route_newports(&rt, IPPROTO_TCP,
233                                 inet->sport, inet->dport, sk);
234         if (err)
235                 goto failure;
236
237         /* OK, now commit destination to socket.  */
238         sk->sk_gso_type = SKB_GSO_TCPV4;
239         sk_setup_caps(sk, &rt->u.dst);
240
241         if (!tp->write_seq)
242                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
243                                                            inet->daddr,
244                                                            inet->sport,
245                                                            usin->sin_port);
246
247         inet->id = tp->write_seq ^ jiffies;
248
249         err = tcp_connect(sk);
250         rt = NULL;
251         if (err)
252                 goto failure;
253
254         return 0;
255
256 failure:
257         /*
258          * This unhashes the socket and releases the local port,
259          * if necessary.
260          */
261         tcp_set_state(sk, TCP_CLOSE);
262         ip_rt_put(rt);
263         sk->sk_route_caps = 0;
264         inet->dport = 0;
265         return err;
266 }
267
268 /*
269  * This routine does path mtu discovery as defined in RFC1191.
270  */
271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275
276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277          * send out by Linux are always <576bytes so they should go through
278          * unfragmented).
279          */
280         if (sk->sk_state == TCP_LISTEN)
281                 return;
282
283         /* We don't check in the destentry if pmtu discovery is forbidden
284          * on this route. We just assume that no packet_to_big packets
285          * are send back when pmtu discovery is not active.
286          * There is a small race when the user changes this flag in the
287          * route, but I think that's acceptable.
288          */
289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
290                 return;
291
292         dst->ops->update_pmtu(dst, mtu);
293
294         /* Something is about to be wrong... Remember soft error
295          * for the case, if this connection will not able to recover.
296          */
297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298                 sk->sk_err_soft = EMSGSIZE;
299
300         mtu = dst_mtu(dst);
301
302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304                 tcp_sync_mss(sk, mtu);
305
306                 /* Resend the TCP packet because it's
307                  * clear that the old packet has been
308                  * dropped. This is the new "fast" path mtu
309                  * discovery.
310                  */
311                 tcp_simple_retransmit(sk);
312         } /* else let the usual retransmit timer handle it */
313 }
314
315 /*
316  * This routine is called by the ICMP module when it gets some
317  * sort of error condition.  If err < 0 then the socket should
318  * be closed and the error returned to the user.  If err > 0
319  * it's just the icmp type << 8 | icmp code.  After adjustment
320  * header points to the first 8 bytes of the tcp header.  We need
321  * to find the appropriate port.
322  *
323  * The locking strategy used here is very "optimistic". When
324  * someone else accesses the socket the ICMP is just dropped
325  * and for some paths there is no check at all.
326  * A more general error queue to queue errors for later handling
327  * is probably better.
328  *
329  */
330
331 void tcp_v4_err(struct sk_buff *skb, u32 info)
332 {
333         struct iphdr *iph = (struct iphdr *)skb->data;
334         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
335         struct tcp_sock *tp;
336         struct inet_sock *inet;
337         const int type = icmp_hdr(skb)->type;
338         const int code = icmp_hdr(skb)->code;
339         struct sock *sk;
340         __u32 seq;
341         int err;
342         struct net *net = dev_net(skb->dev);
343
344         if (skb->len < (iph->ihl << 2) + 8) {
345                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
346                 return;
347         }
348
349         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
350                         iph->saddr, th->source, inet_iif(skb));
351         if (!sk) {
352                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
353                 return;
354         }
355         if (sk->sk_state == TCP_TIME_WAIT) {
356                 inet_twsk_put(inet_twsk(sk));
357                 return;
358         }
359
360         bh_lock_sock(sk);
361         /* If too many ICMPs get dropped on busy
362          * servers this needs to be solved differently.
363          */
364         if (sock_owned_by_user(sk))
365                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
366
367         if (sk->sk_state == TCP_CLOSE)
368                 goto out;
369
370         tp = tcp_sk(sk);
371         seq = ntohl(th->seq);
372         if (sk->sk_state != TCP_LISTEN &&
373             !between(seq, tp->snd_una, tp->snd_nxt)) {
374                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
375                 goto out;
376         }
377
378         switch (type) {
379         case ICMP_SOURCE_QUENCH:
380                 /* Just silently ignore these. */
381                 goto out;
382         case ICMP_PARAMETERPROB:
383                 err = EPROTO;
384                 break;
385         case ICMP_DEST_UNREACH:
386                 if (code > NR_ICMP_UNREACH)
387                         goto out;
388
389                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
390                         if (!sock_owned_by_user(sk))
391                                 do_pmtu_discovery(sk, iph, info);
392                         goto out;
393                 }
394
395                 err = icmp_err_convert[code].errno;
396                 break;
397         case ICMP_TIME_EXCEEDED:
398                 err = EHOSTUNREACH;
399                 break;
400         default:
401                 goto out;
402         }
403
404         switch (sk->sk_state) {
405                 struct request_sock *req, **prev;
406         case TCP_LISTEN:
407                 if (sock_owned_by_user(sk))
408                         goto out;
409
410                 req = inet_csk_search_req(sk, &prev, th->dest,
411                                           iph->daddr, iph->saddr);
412                 if (!req)
413                         goto out;
414
415                 /* ICMPs are not backlogged, hence we cannot get
416                    an established socket here.
417                  */
418                 WARN_ON(req->sk);
419
420                 if (seq != tcp_rsk(req)->snt_isn) {
421                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
422                         goto out;
423                 }
424
425                 /*
426                  * Still in SYN_RECV, just remove it silently.
427                  * There is no good way to pass the error to the newly
428                  * created socket, and POSIX does not want network
429                  * errors returned from accept().
430                  */
431                 inet_csk_reqsk_queue_drop(sk, req, prev);
432                 goto out;
433
434         case TCP_SYN_SENT:
435         case TCP_SYN_RECV:  /* Cannot happen.
436                                It can f.e. if SYNs crossed.
437                              */
438                 if (!sock_owned_by_user(sk)) {
439                         sk->sk_err = err;
440
441                         sk->sk_error_report(sk);
442
443                         tcp_done(sk);
444                 } else {
445                         sk->sk_err_soft = err;
446                 }
447                 goto out;
448         }
449
450         /* If we've already connected we will keep trying
451          * until we time out, or the user gives up.
452          *
453          * rfc1122 4.2.3.9 allows to consider as hard errors
454          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
455          * but it is obsoleted by pmtu discovery).
456          *
457          * Note, that in modern internet, where routing is unreliable
458          * and in each dark corner broken firewalls sit, sending random
459          * errors ordered by their masters even this two messages finally lose
460          * their original sense (even Linux sends invalid PORT_UNREACHs)
461          *
462          * Now we are in compliance with RFCs.
463          *                                                      --ANK (980905)
464          */
465
466         inet = inet_sk(sk);
467         if (!sock_owned_by_user(sk) && inet->recverr) {
468                 sk->sk_err = err;
469                 sk->sk_error_report(sk);
470         } else  { /* Only an error on timeout */
471                 sk->sk_err_soft = err;
472         }
473
474 out:
475         bh_unlock_sock(sk);
476         sock_put(sk);
477 }
478
479 /* This routine computes an IPv4 TCP checksum. */
480 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
481 {
482         struct inet_sock *inet = inet_sk(sk);
483         struct tcphdr *th = tcp_hdr(skb);
484
485         if (skb->ip_summed == CHECKSUM_PARTIAL) {
486                 th->check = ~tcp_v4_check(len, inet->saddr,
487                                           inet->daddr, 0);
488                 skb->csum_start = skb_transport_header(skb) - skb->head;
489                 skb->csum_offset = offsetof(struct tcphdr, check);
490         } else {
491                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
492                                          csum_partial(th,
493                                                       th->doff << 2,
494                                                       skb->csum));
495         }
496 }
497
498 int tcp_v4_gso_send_check(struct sk_buff *skb)
499 {
500         const struct iphdr *iph;
501         struct tcphdr *th;
502
503         if (!pskb_may_pull(skb, sizeof(*th)))
504                 return -EINVAL;
505
506         iph = ip_hdr(skb);
507         th = tcp_hdr(skb);
508
509         th->check = 0;
510         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
511         skb->csum_start = skb_transport_header(skb) - skb->head;
512         skb->csum_offset = offsetof(struct tcphdr, check);
513         skb->ip_summed = CHECKSUM_PARTIAL;
514         return 0;
515 }
516
517 /*
518  *      This routine will send an RST to the other tcp.
519  *
520  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
521  *                    for reset.
522  *      Answer: if a packet caused RST, it is not for a socket
523  *              existing in our system, if it is matched to a socket,
524  *              it is just duplicate segment or bug in other side's TCP.
525  *              So that we build reply only basing on parameters
526  *              arrived with segment.
527  *      Exception: precedence violation. We do not implement it in any case.
528  */
529
530 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
531 {
532         struct tcphdr *th = tcp_hdr(skb);
533         struct {
534                 struct tcphdr th;
535 #ifdef CONFIG_TCP_MD5SIG
536                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
537 #endif
538         } rep;
539         struct ip_reply_arg arg;
540 #ifdef CONFIG_TCP_MD5SIG
541         struct tcp_md5sig_key *key;
542 #endif
543         struct net *net;
544
545         /* Never send a reset in response to a reset. */
546         if (th->rst)
547                 return;
548
549         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
550                 return;
551
552         /* Swap the send and the receive. */
553         memset(&rep, 0, sizeof(rep));
554         rep.th.dest   = th->source;
555         rep.th.source = th->dest;
556         rep.th.doff   = sizeof(struct tcphdr) / 4;
557         rep.th.rst    = 1;
558
559         if (th->ack) {
560                 rep.th.seq = th->ack_seq;
561         } else {
562                 rep.th.ack = 1;
563                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
564                                        skb->len - (th->doff << 2));
565         }
566
567         memset(&arg, 0, sizeof(arg));
568         arg.iov[0].iov_base = (unsigned char *)&rep;
569         arg.iov[0].iov_len  = sizeof(rep.th);
570
571 #ifdef CONFIG_TCP_MD5SIG
572         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
573         if (key) {
574                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
575                                    (TCPOPT_NOP << 16) |
576                                    (TCPOPT_MD5SIG << 8) |
577                                    TCPOLEN_MD5SIG);
578                 /* Update length and the length the header thinks exists */
579                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
580                 rep.th.doff = arg.iov[0].iov_len / 4;
581
582                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
583                                      key, ip_hdr(skb)->saddr,
584                                      ip_hdr(skb)->daddr, &rep.th);
585         }
586 #endif
587         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
588                                       ip_hdr(skb)->saddr, /* XXX */
589                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
590         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
591         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
592
593         net = dev_net(skb_dst(skb)->dev);
594         ip_send_reply(net->ipv4.tcp_sock, skb,
595                       &arg, arg.iov[0].iov_len);
596
597         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
598         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
599 }
600
601 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
602    outside socket context is ugly, certainly. What can I do?
603  */
604
605 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
606                             u32 win, u32 ts, int oif,
607                             struct tcp_md5sig_key *key,
608                             int reply_flags)
609 {
610         struct tcphdr *th = tcp_hdr(skb);
611         struct {
612                 struct tcphdr th;
613                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
614 #ifdef CONFIG_TCP_MD5SIG
615                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
616 #endif
617                         ];
618         } rep;
619         struct ip_reply_arg arg;
620         struct net *net = dev_net(skb_dst(skb)->dev);
621
622         memset(&rep.th, 0, sizeof(struct tcphdr));
623         memset(&arg, 0, sizeof(arg));
624
625         arg.iov[0].iov_base = (unsigned char *)&rep;
626         arg.iov[0].iov_len  = sizeof(rep.th);
627         if (ts) {
628                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
629                                    (TCPOPT_TIMESTAMP << 8) |
630                                    TCPOLEN_TIMESTAMP);
631                 rep.opt[1] = htonl(tcp_time_stamp);
632                 rep.opt[2] = htonl(ts);
633                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
634         }
635
636         /* Swap the send and the receive. */
637         rep.th.dest    = th->source;
638         rep.th.source  = th->dest;
639         rep.th.doff    = arg.iov[0].iov_len / 4;
640         rep.th.seq     = htonl(seq);
641         rep.th.ack_seq = htonl(ack);
642         rep.th.ack     = 1;
643         rep.th.window  = htons(win);
644
645 #ifdef CONFIG_TCP_MD5SIG
646         if (key) {
647                 int offset = (ts) ? 3 : 0;
648
649                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
650                                           (TCPOPT_NOP << 16) |
651                                           (TCPOPT_MD5SIG << 8) |
652                                           TCPOLEN_MD5SIG);
653                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
654                 rep.th.doff = arg.iov[0].iov_len/4;
655
656                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
657                                     key, ip_hdr(skb)->saddr,
658                                     ip_hdr(skb)->daddr, &rep.th);
659         }
660 #endif
661         arg.flags = reply_flags;
662         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663                                       ip_hdr(skb)->saddr, /* XXX */
664                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
665         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666         if (oif)
667                 arg.bound_dev_if = oif;
668
669         ip_send_reply(net->ipv4.tcp_sock, skb,
670                       &arg, arg.iov[0].iov_len);
671
672         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673 }
674
675 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676 {
677         struct inet_timewait_sock *tw = inet_twsk(sk);
678         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682                         tcptw->tw_ts_recent,
683                         tw->tw_bound_dev_if,
684                         tcp_twsk_md5_key(tcptw),
685                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
686                         );
687
688         inet_twsk_put(tw);
689 }
690
691 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
692                                   struct request_sock *req)
693 {
694         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
695                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
696                         req->ts_recent,
697                         0,
698                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
699                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
700 }
701
702 /*
703  *      Send a SYN-ACK after having received a SYN.
704  *      This still operates on a request_sock only, not on a big
705  *      socket.
706  */
707 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
708                                 struct dst_entry *dst)
709 {
710         const struct inet_request_sock *ireq = inet_rsk(req);
711         int err = -1;
712         struct sk_buff * skb;
713
714         /* First, grab a route. */
715         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
716                 return -1;
717
718         skb = tcp_make_synack(sk, dst, req);
719
720         if (skb) {
721                 struct tcphdr *th = tcp_hdr(skb);
722
723                 th->check = tcp_v4_check(skb->len,
724                                          ireq->loc_addr,
725                                          ireq->rmt_addr,
726                                          csum_partial(th, skb->len,
727                                                       skb->csum));
728
729                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
730                                             ireq->rmt_addr,
731                                             ireq->opt);
732                 err = net_xmit_eval(err);
733         }
734
735         dst_release(dst);
736         return err;
737 }
738
739 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
740 {
741         return __tcp_v4_send_synack(sk, req, NULL);
742 }
743
744 /*
745  *      IPv4 request_sock destructor.
746  */
747 static void tcp_v4_reqsk_destructor(struct request_sock *req)
748 {
749         kfree(inet_rsk(req)->opt);
750 }
751
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a SYN-flood notice for the destination port of @skb.  The function-
 * static timestamp limits the message to one per 60 seconds.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
765
766 /*
767  * Save and compile IPv4 options into the request_sock if needed.
768  */
769 static struct ip_options *tcp_v4_save_options(struct sock *sk,
770                                               struct sk_buff *skb)
771 {
772         struct ip_options *opt = &(IPCB(skb)->opt);
773         struct ip_options *dopt = NULL;
774
775         if (opt && opt->optlen) {
776                 int opt_size = optlength(opt);
777                 dopt = kmalloc(opt_size, GFP_ATOMIC);
778                 if (dopt) {
779                         if (ip_options_echo(dopt, skb)) {
780                                 kfree(dopt);
781                                 dopt = NULL;
782                         }
783                 }
784         }
785         return dopt;
786 }
787
788 #ifdef CONFIG_TCP_MD5SIG
789 /*
790  * RFC2385 MD5 checksumming requires a mapping of
791  * IP address->MD5 Key.
792  * We need to maintain these in the sk structure.
793  */
794
795 /* Find the Key structure for an address.  */
796 static struct tcp_md5sig_key *
797                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
798 {
799         struct tcp_sock *tp = tcp_sk(sk);
800         int i;
801
802         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
803                 return NULL;
804         for (i = 0; i < tp->md5sig_info->entries4; i++) {
805                 if (tp->md5sig_info->keys4[i].addr == addr)
806                         return &tp->md5sig_info->keys4[i].base;
807         }
808         return NULL;
809 }
810
811 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
812                                          struct sock *addr_sk)
813 {
814         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
815 }
816
817 EXPORT_SYMBOL(tcp_v4_md5_lookup);
818
819 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
820                                                       struct request_sock *req)
821 {
822         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
823 }
824
825 /* This can be called on a newly created socket, from other files */
826 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
827                       u8 *newkey, u8 newkeylen)
828 {
829         /* Add Key to the list */
830         struct tcp_md5sig_key *key;
831         struct tcp_sock *tp = tcp_sk(sk);
832         struct tcp4_md5sig_key *keys;
833
834         key = tcp_v4_md5_do_lookup(sk, addr);
835         if (key) {
836                 /* Pre-existing entry - just update that one. */
837                 kfree(key->key);
838                 key->key = newkey;
839                 key->keylen = newkeylen;
840         } else {
841                 struct tcp_md5sig_info *md5sig;
842
843                 if (!tp->md5sig_info) {
844                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
845                                                   GFP_ATOMIC);
846                         if (!tp->md5sig_info) {
847                                 kfree(newkey);
848                                 return -ENOMEM;
849                         }
850                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
851                 }
852                 if (tcp_alloc_md5sig_pool() == NULL) {
853                         kfree(newkey);
854                         return -ENOMEM;
855                 }
856                 md5sig = tp->md5sig_info;
857
858                 if (md5sig->alloced4 == md5sig->entries4) {
859                         keys = kmalloc((sizeof(*keys) *
860                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
861                         if (!keys) {
862                                 kfree(newkey);
863                                 tcp_free_md5sig_pool();
864                                 return -ENOMEM;
865                         }
866
867                         if (md5sig->entries4)
868                                 memcpy(keys, md5sig->keys4,
869                                        sizeof(*keys) * md5sig->entries4);
870
871                         /* Free old key list, and reference new one */
872                         kfree(md5sig->keys4);
873                         md5sig->keys4 = keys;
874                         md5sig->alloced4++;
875                 }
876                 md5sig->entries4++;
877                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
878                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
879                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
880         }
881         return 0;
882 }
883
884 EXPORT_SYMBOL(tcp_v4_md5_do_add);
885
886 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
887                                u8 *newkey, u8 newkeylen)
888 {
889         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
890                                  newkey, newkeylen);
891 }
892
893 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
894 {
895         struct tcp_sock *tp = tcp_sk(sk);
896         int i;
897
898         for (i = 0; i < tp->md5sig_info->entries4; i++) {
899                 if (tp->md5sig_info->keys4[i].addr == addr) {
900                         /* Free the key */
901                         kfree(tp->md5sig_info->keys4[i].base.key);
902                         tp->md5sig_info->entries4--;
903
904                         if (tp->md5sig_info->entries4 == 0) {
905                                 kfree(tp->md5sig_info->keys4);
906                                 tp->md5sig_info->keys4 = NULL;
907                                 tp->md5sig_info->alloced4 = 0;
908                         } else if (tp->md5sig_info->entries4 != i) {
909                                 /* Need to do some manipulation */
910                                 memmove(&tp->md5sig_info->keys4[i],
911                                         &tp->md5sig_info->keys4[i+1],
912                                         (tp->md5sig_info->entries4 - i) *
913                                          sizeof(struct tcp4_md5sig_key));
914                         }
915                         tcp_free_md5sig_pool();
916                         return 0;
917                 }
918         }
919         return -ENOENT;
920 }
921
922 EXPORT_SYMBOL(tcp_v4_md5_do_del);
923
924 static void tcp_v4_clear_md5_list(struct sock *sk)
925 {
926         struct tcp_sock *tp = tcp_sk(sk);
927
928         /* Free each key, then the set of key keys,
929          * the crypto element, and then decrement our
930          * hold on the last resort crypto.
931          */
932         if (tp->md5sig_info->entries4) {
933                 int i;
934                 for (i = 0; i < tp->md5sig_info->entries4; i++)
935                         kfree(tp->md5sig_info->keys4[i].base.key);
936                 tp->md5sig_info->entries4 = 0;
937                 tcp_free_md5sig_pool();
938         }
939         if (tp->md5sig_info->keys4) {
940                 kfree(tp->md5sig_info->keys4);
941                 tp->md5sig_info->keys4 = NULL;
942                 tp->md5sig_info->alloced4  = 0;
943         }
944 }
945
946 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
947                                  int optlen)
948 {
949         struct tcp_md5sig cmd;
950         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
951         u8 *newkey;
952
953         if (optlen < sizeof(cmd))
954                 return -EINVAL;
955
956         if (copy_from_user(&cmd, optval, sizeof(cmd)))
957                 return -EFAULT;
958
959         if (sin->sin_family != AF_INET)
960                 return -EINVAL;
961
962         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
963                 if (!tcp_sk(sk)->md5sig_info)
964                         return -ENOENT;
965                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
966         }
967
968         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
969                 return -EINVAL;
970
971         if (!tcp_sk(sk)->md5sig_info) {
972                 struct tcp_sock *tp = tcp_sk(sk);
973                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
974
975                 if (!p)
976                         return -EINVAL;
977
978                 tp->md5sig_info = p;
979                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
980         }
981
982         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
983         if (!newkey)
984                 return -ENOMEM;
985         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
986                                  newkey, cmd.tcpm_keylen);
987 }
988
989 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
990                                         __be32 daddr, __be32 saddr, int nbytes)
991 {
992         struct tcp4_pseudohdr *bp;
993         struct scatterlist sg;
994
995         bp = &hp->md5_blk.ip4;
996
997         /*
998          * 1. the TCP pseudo-header (in the order: source IP address,
999          * destination IP address, zero-padded protocol number, and
1000          * segment length)
1001          */
1002         bp->saddr = saddr;
1003         bp->daddr = daddr;
1004         bp->pad = 0;
1005         bp->protocol = IPPROTO_TCP;
1006         bp->len = cpu_to_be16(nbytes);
1007
1008         sg_init_one(&sg, bp, sizeof(*bp));
1009         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1010 }
1011
1012 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1013                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1014 {
1015         struct tcp_md5sig_pool *hp;
1016         struct hash_desc *desc;
1017
1018         hp = tcp_get_md5sig_pool();
1019         if (!hp)
1020                 goto clear_hash_noput;
1021         desc = &hp->md5_desc;
1022
1023         if (crypto_hash_init(desc))
1024                 goto clear_hash;
1025         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1026                 goto clear_hash;
1027         if (tcp_md5_hash_header(hp, th))
1028                 goto clear_hash;
1029         if (tcp_md5_hash_key(hp, key))
1030                 goto clear_hash;
1031         if (crypto_hash_final(desc, md5_hash))
1032                 goto clear_hash;
1033
1034         tcp_put_md5sig_pool();
1035         return 0;
1036
1037 clear_hash:
1038         tcp_put_md5sig_pool();
1039 clear_hash_noput:
1040         memset(md5_hash, 0, 16);
1041         return 1;
1042 }
1043
1044 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1045                         struct sock *sk, struct request_sock *req,
1046                         struct sk_buff *skb)
1047 {
1048         struct tcp_md5sig_pool *hp;
1049         struct hash_desc *desc;
1050         struct tcphdr *th = tcp_hdr(skb);
1051         __be32 saddr, daddr;
1052
1053         if (sk) {
1054                 saddr = inet_sk(sk)->saddr;
1055                 daddr = inet_sk(sk)->daddr;
1056         } else if (req) {
1057                 saddr = inet_rsk(req)->loc_addr;
1058                 daddr = inet_rsk(req)->rmt_addr;
1059         } else {
1060                 const struct iphdr *iph = ip_hdr(skb);
1061                 saddr = iph->saddr;
1062                 daddr = iph->daddr;
1063         }
1064
1065         hp = tcp_get_md5sig_pool();
1066         if (!hp)
1067                 goto clear_hash_noput;
1068         desc = &hp->md5_desc;
1069
1070         if (crypto_hash_init(desc))
1071                 goto clear_hash;
1072
1073         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_header(hp, th))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1078                 goto clear_hash;
1079         if (tcp_md5_hash_key(hp, key))
1080                 goto clear_hash;
1081         if (crypto_hash_final(desc, md5_hash))
1082                 goto clear_hash;
1083
1084         tcp_put_md5sig_pool();
1085         return 0;
1086
1087 clear_hash:
1088         tcp_put_md5sig_pool();
1089 clear_hash_noput:
1090         memset(md5_hash, 0, 16);
1091         return 1;
1092 }
1093
1094 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1095
1096 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1097 {
1098         /*
1099          * This gets called for each TCP segment that arrives
1100          * so we want to be efficient.
1101          * We have 3 drop cases:
1102          * o No MD5 hash and one expected.
1103          * o MD5 hash and we're not expecting one.
1104          * o MD5 hash and its wrong.
1105          */
1106         __u8 *hash_location = NULL;
1107         struct tcp_md5sig_key *hash_expected;
1108         const struct iphdr *iph = ip_hdr(skb);
1109         struct tcphdr *th = tcp_hdr(skb);
1110         int genhash;
1111         unsigned char newhash[16];
1112
1113         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1114         hash_location = tcp_parse_md5sig_option(th);
1115
1116         /* We've parsed the options - do we have a hash? */
1117         if (!hash_expected && !hash_location)
1118                 return 0;
1119
1120         if (hash_expected && !hash_location) {
1121                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1122                 return 1;
1123         }
1124
1125         if (!hash_expected && hash_location) {
1126                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1127                 return 1;
1128         }
1129
1130         /* Okay, so this is hash_expected and hash_location -
1131          * so we need to calculate the checksum.
1132          */
1133         genhash = tcp_v4_md5_hash_skb(newhash,
1134                                       hash_expected,
1135                                       NULL, NULL, skb);
1136
1137         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1138                 if (net_ratelimit()) {
1139                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1140                                &iph->saddr, ntohs(th->source),
1141                                &iph->daddr, ntohs(th->dest),
1142                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1143                 }
1144                 return 1;
1145         }
1146         return 0;
1147 }
1148
1149 #endif
1150
1151 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1152         .family         =       PF_INET,
1153         .obj_size       =       sizeof(struct tcp_request_sock),
1154         .rtx_syn_ack    =       tcp_v4_send_synack,
1155         .send_ack       =       tcp_v4_reqsk_send_ack,
1156         .destructor     =       tcp_v4_reqsk_destructor,
1157         .send_reset     =       tcp_v4_send_reset,
1158 };
1159
1160 #ifdef CONFIG_TCP_MD5SIG
1161 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1162         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1163         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1164 };
1165 #endif
1166
1167 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1168         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1169         .twsk_unique    = tcp_twsk_unique,
1170         .twsk_destructor= tcp_twsk_destructor,
1171 };
1172
/*
 * Handle the connection request carried by @skb for socket @sk.
 *
 * Allocates a request_sock, records the parsed TCP options and a copy of
 * the IP options in it, chooses the initial sequence number (from a
 * syncookie, from the caller-supplied TCP_SKB_CB(skb)->when, or from
 * tcp_v4_init_sequence()), sends the SYN-ACK through
 * __tcp_v4_send_synack() and finally hashes the request into the request
 * queue of @sk.
 *
 * Always returns 0: a request that is rejected is freed and not queued.
 */
1173 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1174 {
1175         struct inet_request_sock *ireq;
1176         struct tcp_options_received tmp_opt;
1177         struct request_sock *req;
1178         __be32 saddr = ip_hdr(skb)->saddr;
1179         __be32 daddr = ip_hdr(skb)->daddr;
1180         __u32 isn = TCP_SKB_CB(skb)->when;
1181         struct dst_entry *dst = NULL;
1182 #ifdef CONFIG_SYN_COOKIES
1183         int want_cookie = 0;
1184 #else
1185 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1186 #endif
1187
1188         /* Never answer SYNs sent to broadcast or multicast */
1189         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1190                 goto drop;
1191
1192         /* TW buckets are converted to open requests without
1193          * limitations, they conserve resources and peer is
1194          * evidently real one.
1195          */
             /* Request queue full and no ISN supplied: fall back to a
              * syncookie when enabled, otherwise drop the request.
              */
1196         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1197 #ifdef CONFIG_SYN_COOKIES
1198                 if (sysctl_tcp_syncookies) {
1199                         want_cookie = 1;
1200                 } else
1201 #endif
1202                 goto drop;
1203         }
1204
1205         /* Accept backlog is full. If we have already queued enough
1206          * of warm entries in syn queue, drop request. It is better than
1207          * clogging syn queue with openreqs with exponentially increasing
1208          * timeout.
1209          */
1210         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1211                 goto drop;
1212
1213         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1214         if (!req)
1215                 goto drop;
1216
1217 #ifdef CONFIG_TCP_MD5SIG
1218         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1219 #endif
1220
             /* Parse the TCP options of the request into tmp_opt, starting
              * from an MSS clamp of 536 and the user MSS set on @sk.
              */
1221         tcp_clear_options(&tmp_opt);
1222         tmp_opt.mss_clamp = 536;
1223         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1224
1225         tcp_parse_options(skb, &tmp_opt, 0);
1226
             /* In syncookie mode the parsed options are discarded unless a
              * timestamp option was seen.
              */
1227         if (want_cookie && !tmp_opt.saw_tstamp)
1228                 tcp_clear_options(&tmp_opt);
1229
1230         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1231
1232         tcp_openreq_init(req, &tmp_opt, skb);
1233
             /* Addresses are stored from our point of view: the packet's
              * destination is the local address, its source the remote one.
              */
1234         ireq = inet_rsk(req);
1235         ireq->loc_addr = daddr;
1236         ireq->rmt_addr = saddr;
1237         ireq->no_srccheck = inet_sk(sk)->transparent;
1238         ireq->opt = tcp_v4_save_options(sk, skb);
1239
1240         if (security_inet_conn_request(sk, skb, req))
1241                 goto drop_and_free;
1242
1243         if (!want_cookie)
1244                 TCP_ECN_create_request(req, tcp_hdr(skb));
1245
1246         if (want_cookie) {
1247 #ifdef CONFIG_SYN_COOKIES
1248                 syn_flood_warning(skb);
1249                 req->cookie_ts = tmp_opt.tstamp_ok;
1250 #endif
1251                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1252         } else if (!isn) {
1253                 struct inet_peer *peer = NULL;
1254
1255                 /* VJ's idea. We save last timestamp seen
1256                  * from the destination in peer table, when entering
1257                  * state TIME-WAIT, and check against it before
1258                  * accepting new connection request.
1259                  *
1260                  * If "isn" is not zero, this request hit alive
1261                  * timewait bucket, so that all the necessary checks
1262                  * are made in the function processing timewait state.
1263                  */
1264                 if (tmp_opt.saw_tstamp &&
1265                     tcp_death_row.sysctl_tw_recycle &&
1266                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1267                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1268                     peer->v4daddr == saddr) {
1269                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1270                             (s32)(peer->tcp_ts - req->ts_recent) >
1271                                                         TCP_PAWS_WINDOW) {
1272                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1273                                 goto drop_and_release;
1274                         }
1275                 }
1276                 /* Kill the following clause, if you dislike this way. */
1277                 else if (!sysctl_tcp_syncookies &&
1278                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1279                           (sysctl_max_syn_backlog >> 2)) &&
1280                          (!peer || !peer->tcp_ts_stamp) &&
1281                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1282                         /* Without syncookies last quarter of
1283                          * backlog is filled with destinations,
1284                          * proven to be alive.
1285                          * It means that we continue to communicate
1286                          * to destinations, already remembered
1287                          * to the moment of synflood.
1288                          */
1289                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1290                                        &saddr, ntohs(tcp_hdr(skb)->source));
1291                         goto drop_and_release;
1292                 }
1293
1294                 isn = tcp_v4_init_sequence(skb);
1295         }
1296         tcp_rsk(req)->snt_isn = isn;
1297
             /* A syncookie request is never queued: it is freed right after
              * the SYN-ACK attempt, just like a request whose SYN-ACK failed.
              */
1298         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1299                 goto drop_and_free;
1300
1301         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1302         return 0;
1303
             /* Error unwinding: each label undoes one more step than the
              * label below it.
              */
1304 drop_and_release:
1305         dst_release(dst);
1306 drop_and_free:
1307         reqsk_free(req);
1308 drop:
1309         return 0;
1310 }
1311
1312
1313 /*
1314  * The three way handshake has completed - we got a valid synack -
1315  * now create the new socket.
1316  */
1317 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1318                                   struct request_sock *req,
1319                                   struct dst_entry *dst)
1320 {
1321         struct inet_request_sock *ireq;
1322         struct inet_sock *newinet;
1323         struct tcp_sock *newtp;
1324         struct sock *newsk;
1325 #ifdef CONFIG_TCP_MD5SIG
1326         struct tcp_md5sig_key *key;
1327 #endif
1328
1329         if (sk_acceptq_is_full(sk))
1330                 goto exit_overflow;
1331
1332         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1333                 goto exit;
1334
1335         newsk = tcp_create_openreq_child(sk, req, skb);
1336         if (!newsk)
1337                 goto exit;
1338
1339         newsk->sk_gso_type = SKB_GSO_TCPV4;
1340         sk_setup_caps(newsk, dst);
1341
1342         newtp                 = tcp_sk(newsk);
1343         newinet               = inet_sk(newsk);
1344         ireq                  = inet_rsk(req);
1345         newinet->daddr        = ireq->rmt_addr;
1346         newinet->rcv_saddr    = ireq->loc_addr;
1347         newinet->saddr        = ireq->loc_addr;
1348         newinet->opt          = ireq->opt;
1349         ireq->opt             = NULL;
1350         newinet->mc_index     = inet_iif(skb);
1351         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1352         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1353         if (newinet->opt)
1354                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1355         newinet->id = newtp->write_seq ^ jiffies;
1356
1357         tcp_mtup_init(newsk);
1358         tcp_sync_mss(newsk, dst_mtu(dst));
1359         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1360         if (tcp_sk(sk)->rx_opt.user_mss &&
1361             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1362                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1363
1364         tcp_initialize_rcv_mss(newsk);
1365
1366 #ifdef CONFIG_TCP_MD5SIG
1367         /* Copy over the MD5 key from the original socket */
1368         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1369                 /*
1370                  * We're using one, so create a matching key
1371                  * on the newsk structure. If we fail to get
1372                  * memory, then we end up not copying the key
1373                  * across. Shucks.
1374                  */
1375                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1376                 if (newkey != NULL)
1377                         tcp_v4_md5_do_add(newsk, newinet->daddr,
1378                                           newkey, key->keylen);
1379                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1380         }
1381 #endif
1382
1383         __inet_hash_nolisten(newsk);
1384         __inet_inherit_port(sk, newsk);
1385
1386         return newsk;
1387
1388 exit_overflow:
1389         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1390 exit:
1391         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1392         dst_release(dst);
1393         return NULL;
1394 }
1395
1396 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1397 {
1398         struct tcphdr *th = tcp_hdr(skb);
1399         const struct iphdr *iph = ip_hdr(skb);
1400         struct sock *nsk;
1401         struct request_sock **prev;
1402         /* Find possible connection requests. */
1403         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1404                                                        iph->saddr, iph->daddr);
1405         if (req)
1406                 return tcp_check_req(sk, skb, req, prev);
1407
1408         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1409                         th->source, iph->daddr, th->dest, inet_iif(skb));
1410
1411         if (nsk) {
1412                 if (nsk->sk_state != TCP_TIME_WAIT) {
1413                         bh_lock_sock(nsk);
1414                         return nsk;
1415                 }
1416                 inet_twsk_put(inet_twsk(nsk));
1417                 return NULL;
1418         }
1419
1420 #ifdef CONFIG_SYN_COOKIES
1421         if (!th->rst && !th->syn && th->ack)
1422                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1423 #endif
1424         return sk;
1425 }
1426
1427 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1428 {
1429         const struct iphdr *iph = ip_hdr(skb);
1430
1431         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1432                 if (!tcp_v4_check(skb->len, iph->saddr,
1433                                   iph->daddr, skb->csum)) {
1434                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1435                         return 0;
1436                 }
1437         }
1438
1439         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1440                                        skb->len, IPPROTO_TCP, 0);
1441
1442         if (skb->len <= 76) {
1443                 return __skb_checksum_complete(skb);
1444         }
1445         return 0;
1446 }
1447
1448
1449 /* The socket must have its spinlock held when we get
1450  * here.
1451  *
1452  * We have a potential double-lock case here, so even when
1453  * doing backlog processing we use the BH locking scheme.
1454  * This is because we cannot sleep with the original spinlock
1455  * held.
1456  */
/*
 * Process one received segment @skb on socket @sk.
 *
 * Established sockets take the fast path through tcp_rcv_established();
 * everything else goes through tcp_rcv_state_process(), after a lookup
 * via tcp_v4_hnd_req() when @sk is in TCP_LISTEN.
 *
 * Always returns 0.  On a processing failure a reset is sent and the skb
 * is freed here.
 */
1457 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1458 {
1459         struct sock *rsk;
1460 #ifdef CONFIG_TCP_MD5SIG
1461         /*
1462          * We really want to reject the packet as early as possible
1463          * if:
1464          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1465          *  o There is an MD5 option and we're not expecting one
1466          */
1467         if (tcp_v4_inbound_md5_hash(sk, skb))
1468                 goto discard;
1469 #endif
1470
1471         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1472                 TCP_CHECK_TIMER(sk);
1473                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1474                         rsk = sk;
1475                         goto reset;
1476                 }
1477                 TCP_CHECK_TIMER(sk);
1478                 return 0;
1479         }
1480
             /* Slow path: reject truncated headers and bad checksums. */
1481         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1482                 goto csum_err;
1483
1484         if (sk->sk_state == TCP_LISTEN) {
                     /* tcp_v4_hnd_req() yields NULL (discard), sk itself
                      * (continue below) or another socket that takes over
                      * the segment.
                      */
1485                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1486                 if (!nsk)
1487                         goto discard;
1488
1489                 if (nsk != sk) {
1490                         if (tcp_child_process(sk, nsk, skb)) {
1491                                 rsk = nsk;
1492                                 goto reset;
1493                         }
1494                         return 0;
1495                 }
1496         }
1497
1498         TCP_CHECK_TIMER(sk);
1499         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1500                 rsk = sk;
1501                 goto reset;
1502         }
1503         TCP_CHECK_TIMER(sk);
1504         return 0;
1505
             /* rsk is the socket on whose behalf the reset is sent. */
1506 reset:
1507         tcp_v4_send_reset(rsk, skb);
1508 discard:
1509         kfree_skb(skb);
1510         /* Be careful here. If this function gets more complicated and
1511          * gcc suffers from register pressure on the x86, sk (in %ebx)
1512          * might be destroyed here. This current version compiles correctly,
1513          * but you have been warned.
1514          */
1515         return 0;
1516
1517 csum_err:
1518         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1519         goto discard;
1520 }
1521
1522 /*
1523  *      From tcp_input.c
1524  */
1525
/*
 * Receive one IPv4 TCP segment.
 *
 * Validates the TCP header, fills in TCP_SKB_CB(skb), looks up the owning
 * socket and then either processes the segment right away (directly or
 * through the prequeue) or, when the socket is owned by user context,
 * appends it to the socket backlog.  Segments without a socket are
 * answered with a reset unless they are malformed; time-wait sockets are
 * handled in the do_time_wait section.
 *
 * Returns the result of tcp_v4_do_rcv() when it was called, otherwise 0.
 */
1526 int tcp_v4_rcv(struct sk_buff *skb)
1527 {
1528         const struct iphdr *iph;
1529         struct tcphdr *th;
1530         struct sock *sk;
1531         int ret;
1532         struct net *net = dev_net(skb->dev);
1533
1534         if (skb->pkt_type != PACKET_HOST)
1535                 goto discard_it;
1536
1537         /* Count it even if it's bad */
1538         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1539
             /* Make sure the fixed TCP header is in the linear area. */
1540         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1541                 goto discard_it;
1542
1543         th = tcp_hdr(skb);
1544
             /* The data offset must cover at least the fixed header; then
              * pull in the options as well.
              */
1545         if (th->doff < sizeof(struct tcphdr) / 4)
1546                 goto bad_packet;
1547         if (!pskb_may_pull(skb, th->doff * 4))
1548                 goto discard_it;
1549
1550         /* An explanation is required here, I think.
1551          * Packet length and doff are validated by header prediction,
1552          * provided case of th->doff==0 is eliminated.
1553          * So, we defer the checks. */
1554         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1555                 goto bad_packet;
1556
             /* Header pointers are fetched again after the pulls above. */
1557         th = tcp_hdr(skb);
1558         iph = ip_hdr(skb);
1559         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1560         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1561                                     skb->len - th->doff * 4);
1562         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1563         TCP_SKB_CB(skb)->when    = 0;
1564         TCP_SKB_CB(skb)->flags   = iph->tos;
1565         TCP_SKB_CB(skb)->sacked  = 0;
1566
1567         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1568         if (!sk)
1569                 goto no_tcp_socket;
1570
             /* Also re-entered from do_time_wait with a listener in sk. */
1571 process:
1572         if (sk->sk_state == TCP_TIME_WAIT)
1573                 goto do_time_wait;
1574
1575         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1576                 goto discard_and_relse;
1577         nf_reset(skb);
1578
1579         if (sk_filter(sk, skb))
1580                 goto discard_and_relse;
1581
1582         skb->dev = NULL;
1583
1584         bh_lock_sock_nested(sk);
1585         ret = 0;
             /* Socket not owned by user context: handle the segment now,
              * unless tcp_prequeue() takes it.  Otherwise defer it to the
              * backlog.
              */
1586         if (!sock_owned_by_user(sk)) {
1587 #ifdef CONFIG_NET_DMA
1588                 struct tcp_sock *tp = tcp_sk(sk);
1589                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1590                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1591                 if (tp->ucopy.dma_chan)
1592                         ret = tcp_v4_do_rcv(sk, skb);
1593                 else
1594 #endif
1595                 {
1596                         if (!tcp_prequeue(sk, skb))
1597                                 ret = tcp_v4_do_rcv(sk, skb);
1598                 }
1599         } else
1600                 sk_add_backlog(sk, skb);
1601         bh_unlock_sock(sk);
1602
1603         sock_put(sk);
1604
1605         return ret;
1606
1607 no_tcp_socket:
1608         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1609                 goto discard_it;
1610
             /* Malformed segments are only counted; well-formed ones without
              * a socket are answered with a reset.  Note that bad_packet is
              * a label inside this if body and is also reached by goto from
              * the header checks above.
              */
1611         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1612 bad_packet:
1613                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1614         } else {
1615                 tcp_v4_send_reset(NULL, skb);
1616         }
1617
1618 discard_it:
1619         /* Discard frame. */
1620         kfree_skb(skb);
1621         return 0;
1622
1623 discard_and_relse:
1624         sock_put(sk);
1625         goto discard_it;
1626
             /* sk is a time-wait socket here; its reference is dropped with
              * inet_twsk_put() on the paths that give up on it.
              */
1627 do_time_wait:
1628         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1629                 inet_twsk_put(inet_twsk(sk));
1630                 goto discard_it;
1631         }
1632
1633         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1634                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1635                 inet_twsk_put(inet_twsk(sk));
1636                 goto discard_it;
1637         }
1638         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1639         case TCP_TW_SYN: {
                     /* If a listener exists for the destination, retire the
                      * time-wait socket and process the segment on the
                      * listener instead.
                      */
1640                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1641                                                         &tcp_hashinfo,
1642                                                         iph->daddr, th->dest,
1643                                                         inet_iif(skb));
1644                 if (sk2) {
1645                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1646                         inet_twsk_put(inet_twsk(sk));
1647                         sk = sk2;
1648                         goto process;
1649                 }
1650                 /* Fall through to ACK */
1651         }
1652         case TCP_TW_ACK:
1653                 tcp_v4_timewait_ack(sk, skb);
1654                 break;
1655         case TCP_TW_RST:
1656                 goto no_tcp_socket;
1657         case TCP_TW_SUCCESS:;
1658         }
1659         goto discard_it;
1660 }
1661
1662 /* VJ's idea. Save last timestamp seen from this destination
1663  * and hold it at least for normal timewait interval to use for duplicate
1664  * segment detection in subsequent connections, before they enter synchronized
1665  * state.
      *
      * tcp_v4_remember_stamp() copies the socket's rx_opt.ts_recent and
      * rx_opt.ts_recent_stamp into the inet_peer entry for inet->daddr.
      *
      * The peer entry comes from the socket's cached route when that route's
      * rt_dst equals inet->daddr; otherwise it is looked up with inet_getpeer()
      * and released with inet_putpeer() before returning.
      *
      * Returns 1 when a peer entry was available (whether or not it was
      * actually updated) and 0 when no peer entry could be obtained.
1666  */
1667
1668 int tcp_v4_remember_stamp(struct sock *sk)
1669 {
1670         struct inet_sock *inet = inet_sk(sk);
1671         struct tcp_sock *tp = tcp_sk(sk);
1672         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1673         struct inet_peer *peer = NULL;
1674         int release_it = 0;
1675
             /* No cached route, or it points elsewhere: look the peer up
              * directly and remember that the reference must be dropped.
              */
1676         if (!rt || rt->rt_dst != inet->daddr) {
1677                 peer = inet_getpeer(inet->daddr, 1);
1678                 release_it = 1;
1679         } else {
                     /* Use the peer attached to the route, binding one first
                      * if the route has none yet.
                      */
1680                 if (!rt->peer)
1681                         rt_bind_peer(rt, 1);
1682                 peer = rt->peer;
1683         }
1684
1685         if (peer) {
                     /* Overwrite the stored values when the peer's timestamp
                      * is not ahead of the socket's (signed 32-bit difference
                      * <= 0), or when the stored entry is older than
                      * TCP_PAWS_MSL seconds and not more recent than the
                      * socket's own ts_recent_stamp.
                      */
1686                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1687                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1688                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1689                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1690                         peer->tcp_ts = tp->rx_opt.ts_recent;
1691                 }
                     /* Only the inet_getpeer() path took its own reference. */
1692                 if (release_it)
1693                         inet_putpeer(peer);
1694                 return 1;
1695         }
1696
1697         return 0;
1698 }
1699
/*
 * TIME_WAIT counterpart of tcp_v4_remember_stamp(): save the timestamp
 * state held in a timewait socket (tw_ts_recent / tw_ts_recent_stamp) into
 * the inet_peer entry for tw->tw_daddr.
 *
 * The peer is always obtained with inet_getpeer() here and released with
 * inet_putpeer() before returning.
 *
 * Returns 1 if a peer entry was obtained (updated or not), 0 otherwise.
 */
1700 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1701 {
1702         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1703
1704         if (peer) {
1705                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1706
                     /* Same update rule as tcp_v4_remember_stamp(): replace
                      * the stored pair when the peer's timestamp is not ahead
                      * of ours, or when the stored entry is older than
                      * TCP_PAWS_MSL seconds and not newer than ours.
                      */
1707                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1708                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1709                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1710                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1711                         peer->tcp_ts       = tcptw->tw_ts_recent;
1712                 }
1713                 inet_putpeer(peer);
1714                 return 1;
1715         }
1716
1717         return 0;
1718 }
1719
/*
 * Address-family operations for TCP over IPv4.  tcp_v4_init_sock() installs
 * this table as icsk->icsk_af_ops, and the symbol is exported at the end of
 * the file.  The compat_* entries exist only when CONFIG_COMPAT is set.
 */
1720 struct inet_connection_sock_af_ops ipv4_specific = {
1721         .queue_xmit        = ip_queue_xmit,
1722         .send_check        = tcp_v4_send_check,
1723         .rebuild_header    = inet_sk_rebuild_header,
1724         .conn_request      = tcp_v4_conn_request,
1725         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1726         .remember_stamp    = tcp_v4_remember_stamp,
1727         .net_header_len    = sizeof(struct iphdr),
1728         .setsockopt        = ip_setsockopt,
1729         .getsockopt        = ip_getsockopt,
1730         .addr2sockaddr     = inet_csk_addr2sockaddr,
1731         .sockaddr_len      = sizeof(struct sockaddr_in),
1732         .bind_conflict     = inet_csk_bind_conflict,
1733 #ifdef CONFIG_COMPAT
1734         .compat_setsockopt = compat_ip_setsockopt,
1735         .compat_getsockopt = compat_ip_getsockopt,
1736 #endif
1737 };
1738
/*
 * IPv4 MD5-signature callbacks.  Only built with CONFIG_TCP_MD5SIG;
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
1739 #ifdef CONFIG_TCP_MD5SIG
1740 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1741         .md5_lookup             = tcp_v4_md5_lookup,
1742         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1743         .md5_add                = tcp_v4_md5_add_func,
1744         .md5_parse              = tcp_v4_parse_md5_keys,
1745 };
1746 #endif
1747
1748 /* NOTE: A lot of things set to zero explicitly by call to
1749  *       sk_alloc() so need not be done here.
      *
      * tcp_v4_init_sock() is the .init hook of tcp_prot.  It initialises the
      * out-of-order queue, transmit timers and prequeue, sets the initial
      * RTO, congestion and MSS values, puts the socket in TCP_CLOSE, installs
      * the IPv4 operation tables and default buffer sizes, and increments
      * tcp_sockets_allocated.
      *
      * Always returns 0.
1750  */
1751 static int tcp_v4_init_sock(struct sock *sk)
1752 {
1753         struct inet_connection_sock *icsk = inet_csk(sk);
1754         struct tcp_sock *tp = tcp_sk(sk);
1755
1756         skb_queue_head_init(&tp->out_of_order_queue);
1757         tcp_init_xmit_timers(sk);
1758         tcp_prequeue_init(tp);
1759
             /* Both the retransmission timeout and mdev start from
              * TCP_TIMEOUT_INIT.
              */
1760         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1761         tp->mdev = TCP_TIMEOUT_INIT;
1762
1763         /* So many TCP implementations out there (incorrectly) count the
1764          * initial SYN frame in their delayed-ACK and congestion control
1765          * algorithms that we must have the following bandaid to talk
1766          * efficiently to them.  -DaveM
1767          */
1768         tp->snd_cwnd = 2;
1769
1770         /* See draft-stevens-tcpca-spec-01 for discussion of the
1771          * initialization of these values.
1772          */
1773         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1774         tp->snd_cwnd_clamp = ~0;
1775         tp->mss_cache = 536;
1776
1777         tp->reordering = sysctl_tcp_reordering;
1778         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1779
1780         sk->sk_state = TCP_CLOSE;
1781
1782         sk->sk_write_space = sk_stream_write_space;
1783         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1784
             /* Hook up the IPv4-specific operation tables defined above. */
1785         icsk->icsk_af_ops = &ipv4_specific;
1786         icsk->icsk_sync_mss = tcp_sync_mss;
1787 #ifdef CONFIG_TCP_MD5SIG
1788         tp->af_specific = &tcp_sock_ipv4_specific;
1789 #endif
1790
             /* Default send/receive buffer sizes: middle entry of each
              * sysctl triple.
              */
1791         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1792         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1793
             /* The counter is incremented with bottom halves disabled;
              * tcp_v4_destroy_sock() performs the matching decrement.
              */
1794         local_bh_disable();
1795         percpu_counter_inc(&tcp_sockets_allocated);
1796         local_bh_enable();
1797
1798         return 0;
1799 }
1800
/*
 * .destroy hook of tcp_prot: release everything a TCP socket still holds.
 *
 * Stops the transmit timers, tears down congestion control, purges the
 * write, out-of-order and prequeue queues (plus the async wait queue under
 * CONFIG_NET_DMA), frees the MD5 key list under CONFIG_TCP_MD5SIG, drops the
 * bind bucket and the cached sendmsg page if present, and finally
 * decrements tcp_sockets_allocated (incremented in tcp_v4_init_sock()).
 */
1801 void tcp_v4_destroy_sock(struct sock *sk)
1802 {
1803         struct tcp_sock *tp = tcp_sk(sk);
1804
1805         tcp_clear_xmit_timers(sk);
1806
1807         tcp_cleanup_congestion_control(sk);
1808
1809         /* Cleanup up the write buffer. */
1810         tcp_write_queue_purge(sk);
1811
1812         /* Cleans up our, hopefully empty, out_of_order_queue. */
1813         __skb_queue_purge(&tp->out_of_order_queue);
1814
1815 #ifdef CONFIG_TCP_MD5SIG
1816         /* Clean up the MD5 key list, if any */
1817         if (tp->md5sig_info) {
1818                 tcp_v4_clear_md5_list(sk);
1819                 kfree(tp->md5sig_info);
                     /* Clear the pointer so the freed block cannot be reused. */
1820                 tp->md5sig_info = NULL;
1821         }
1822 #endif
1823
1824 #ifdef CONFIG_NET_DMA
1825         /* Cleans up our sk_async_wait_queue */
1826         __skb_queue_purge(&sk->sk_async_wait_queue);
1827 #endif
1828
1829         /* Clean prequeue, it must be empty really */
1830         __skb_queue_purge(&tp->ucopy.prequeue);
1831
1832         /* Clean up a referenced TCP bind bucket. */
1833         if (inet_csk(sk)->icsk_bind_hash)
1834                 inet_put_port(sk);
1835
1836         /*
1837          * If sendmsg cached page exists, toss it.
1838          */
1839         if (sk->sk_sndmsg_page) {
1840                 __free_page(sk->sk_sndmsg_page);
1841                 sk->sk_sndmsg_page = NULL;
1842         }
1843
1844         percpu_counter_dec(&tcp_sockets_allocated);
1845 }
1846
1847 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1848
1849 #ifdef CONFIG_PROC_FS
1850 /* Proc filesystem TCP sock list dumping. */
1851
/*
 * Return the first timewait socket on a nulls-list, i.e. the
 * inet_timewait_sock whose tw_node is head->first, or NULL when the list
 * is empty.
 */
1852 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1853 {
1854         return hlist_nulls_empty(head) ? NULL :
1855                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1856 }
1857
/*
 * Return the timewait socket following @tw on its nulls-list, or NULL when
 * tw->tw_node.next is a nulls marker (end of chain).
 */
1858 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1859 {
1860         return !is_a_nulls(tw->tw_node.next) ?
1861                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1862 }
1863
/*
 * Advance the /proc iterator through the listening hash and the pending
 * connection requests (SYN table) of each listening socket.
 *
 * @cur is the previously returned element, or NULL to start from bucket 0.
 * Depending on st->state it is a struct sock (TCP_SEQ_STATE_LISTENING) or a
 * struct request_sock (TCP_SEQ_STATE_OPENREQ).
 *
 * Returns the next socket or request matching st->family (sockets must
 * also belong to the seq_file's network namespace), or NULL once every
 * bucket has been walked.
 *
 * Locking as visible in this function: when a non-NULL element is returned
 * the spinlock of listening_hash[st->bucket] is still held; when the
 * element is a request_sock the syn_wait_lock of st->syn_wait_sk is held
 * for reading as well.  tcp_seq_stop() releases them.  On a NULL return
 * the last bucket lock has already been dropped.
 */
1864 static void *listening_get_next(struct seq_file *seq, void *cur)
1865 {
1866         struct inet_connection_sock *icsk;
1867         struct hlist_nulls_node *node;
1868         struct sock *sk = cur;
1869         struct inet_listen_hashbucket *ilb;
1870         struct tcp_iter_state *st = seq->private;
1871         struct net *net = seq_file_net(seq);
1872
             /* First call: lock bucket 0 and start scanning at its head. */
1873         if (!sk) {
1874                 st->bucket = 0;
1875                 ilb = &tcp_hashinfo.listening_hash[0];
1876                 spin_lock_bh(&ilb->lock);
1877                 sk = sk_nulls_head(&ilb->head);
1878                 goto get_sk;
1879         }
             /* Continuing: the lock of bucket st->bucket is already held. */
1880         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1881         ++st->num;
1882
1883         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request_sock of st->syn_wait_sk; its
                      * syn_wait_lock is still held from the previous call.
                      */
1884                 struct request_sock *req = cur;
1885
1886                 icsk = inet_csk(st->syn_wait_sk);
1887                 req = req->dl_next;
1888                 while (1) {
                             /* Walk the current SYN-table chain. */
1889                         while (req) {
1890                                 if (req->rsk_ops->family == st->family) {
1891                                         cur = req;
1892                                         goto out;
1893                                 }
1894                                 req = req->dl_next;
1895                         }
                             /* Chain exhausted: move on to the next SYN-table
                              * slot, or stop when all slots have been seen.
                              */
1896                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1897                                 break;
1898 get_req:
1899                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1900                 }
                     /* Done with this listener's requests: resume with the
                      * socket after it and drop the syn_wait_lock.
                      */
1901                 sk        = sk_next(st->syn_wait_sk);
1902                 st->state = TCP_SEQ_STATE_LISTENING;
1903                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1904         } else {
                     /* cur is the listener returned last time: visit its
                      * pending requests (if any) before the next socket.
                      */
1905                 icsk = inet_csk(sk);
1906                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1908                         goto start_req;
1909                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1910                 sk = sk_next(sk);
1911         }
1912 get_sk:
1913         sk_nulls_for_each_from(sk, node) {
1914                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1915                         cur = sk;
1916                         goto out;
1917                 }
                     /* Socket itself is skipped, but its request queue is
                      * still inspected.
                      */
1918                 icsk = inet_csk(sk);
1919                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1920                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1921 start_req:
                             /* Entered with syn_wait_lock held for reading;
                              * it stays held while requests are iterated.
                              */
1922                         st->uid         = sock_i_uid(sk);
1923                         st->syn_wait_sk = sk;
1924                         st->state       = TCP_SEQ_STATE_OPENREQ;
1925                         st->sbucket     = 0;
1926                         goto get_req;
1927                 }
1928                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1929         }
             /* Bucket finished: unlock it and try the next one, if any. */
1930         spin_unlock_bh(&ilb->lock);
1931         if (++st->bucket < INET_LHTABLE_SIZE) {
1932                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1933                 spin_lock_bh(&ilb->lock);
1934                 sk = sk_nulls_head(&ilb->head);
1935                 goto get_sk;
1936         }
1937         cur = NULL;
1938 out:
1939         return cur;
1940 }
1941
/*
 * Return the listening-side element at index *pos, or NULL if there are
 * fewer elements than that.
 *
 * *pos is decremented in place for every element stepped over, so after a
 * NULL return it holds the offset still to be skipped; tcp_get_idx() passes
 * that remainder on to established_get_idx().
 */
1942 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1943 {
1944         void *rc = listening_get_next(seq, NULL);
1945
1946         while (rc && *pos) {
1947                 rc = listening_get_next(seq, rc);
1948                 --*pos;
1949         }
1950         return rc;
1951 }
1952
/*
 * Non-zero when both the established chain and the timewait chain of
 * ehash bucket st->bucket are empty.  No lock is taken here.
 */
1953 static inline int empty_bucket(struct tcp_iter_state *st)
1954 {
1955         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1956                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1957 }
1958
/*
 * Find the first socket in the established hash that matches st->family
 * and the seq_file's network namespace.
 *
 * Buckets are scanned from 0; within a bucket the established chain is
 * searched before the timewait chain.  On a timewait hit st->state is left
 * at TCP_SEQ_STATE_TIME_WAIT; on an established-chain hit st->state is not
 * modified here.
 *
 * Returns the socket (struct sock or struct inet_timewait_sock) with the
 * lock of ehash bucket st->bucket still held, or NULL with no lock held
 * when nothing matches.
 */
1959 static void *established_get_first(struct seq_file *seq)
1960 {
1961         struct tcp_iter_state *st = seq->private;
1962         struct net *net = seq_file_net(seq);
1963         void *rc = NULL;
1964
1965         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1966                 struct sock *sk;
1967                 struct hlist_nulls_node *node;
1968                 struct inet_timewait_sock *tw;
1969                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1970
1971                 /* Lockless fast path for the common case of empty buckets */
1972                 if (empty_bucket(st))
1973                         continue;
1974
1975                 spin_lock_bh(lock);
1976                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1977                         if (sk->sk_family != st->family ||
1978                             !net_eq(sock_net(sk), net)) {
1979                                 continue;
1980                         }
                             /* Match: return with the bucket lock held. */
1981                         rc = sk;
1982                         goto out;
1983                 }
                     /* Nothing in the established chain: try the timewait
                      * chain of the same bucket.
                      */
1984                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1985                 inet_twsk_for_each(tw, node,
1986                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1987                         if (tw->tw_family != st->family ||
1988                             !net_eq(twsk_net(tw), net)) {
1989                                 continue;
1990                         }
1991                         rc = tw;
1992                         goto out;
1993                 }
                     /* Bucket had no match: unlock and reset the state
                      * before moving to the next bucket.
                      */
1994                 spin_unlock_bh(lock);
1995                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1996         }
1997 out:
1998         return rc;
1999 }
2000
/*
 * Advance the iterator within the established hash.
 *
 * @cur is the element returned previously: a struct inet_timewait_sock when
 * st->state is TCP_SEQ_STATE_TIME_WAIT, otherwise a struct sock.  The lock
 * of ehash bucket st->bucket is expected to be held on entry (it is dropped
 * and re-taken here only when moving to another bucket).
 *
 * Order of traversal: remaining established sockets of the bucket, then its
 * timewait sockets, then the next non-empty bucket.
 *
 * Returns the next matching element with the bucket lock held, or NULL
 * (lock released) once the last bucket is exhausted.
 */
2001 static void *established_get_next(struct seq_file *seq, void *cur)
2002 {
2003         struct sock *sk = cur;
2004         struct inet_timewait_sock *tw;
2005         struct hlist_nulls_node *node;
2006         struct tcp_iter_state *st = seq->private;
2007         struct net *net = seq_file_net(seq);
2008
2009         ++st->num;
2010
2011         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2012                 tw = cur;
2013                 tw = tw_next(tw);
2014 get_tw:
                     /* Skip timewait entries of other families/namespaces. */
2015                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2016                         tw = tw_next(tw);
2017                 }
2018                 if (tw) {
2019                         cur = tw;
2020                         goto out;
2021                 }
                     /* Timewait chain exhausted: this bucket is done. */
2022                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2023                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2024
2025                 /* Look for next non empty bucket */
2026                 while (++st->bucket < tcp_hashinfo.ehash_size &&
2027                                 empty_bucket(st))
2028                         ;
2029                 if (st->bucket >= tcp_hashinfo.ehash_size)
2030                         return NULL;
2031
2032                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2033                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2034         } else
2035                 sk = sk_nulls_next(sk);
2036
2037         sk_nulls_for_each_from(sk, node) {
2038                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2039                         goto found;
2040         }
2041
             /* No further established socket here: switch to the bucket's
              * timewait chain.
              */
2042         st->state = TCP_SEQ_STATE_TIME_WAIT;
2043         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2044         goto get_tw;
2045 found:
2046         cur = sk;
2047 out:
2048         return cur;
2049 }
2050
/*
 * Return the element at index @pos of the established/timewait traversal,
 * or NULL if the traversal ends before reaching it.  @pos is passed by
 * value, so the caller's offset is not modified.
 */
2051 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2052 {
2053         void *rc = established_get_first(seq);
2054
2055         while (rc && pos) {
2056                 rc = established_get_next(seq, rc);
2057                 --pos;
2058         }
2059         return rc;
2060 }
2061
/*
 * Position the iterator on the element at index @pos of the whole dump:
 * listening sockets (and their open requests) first, then the established
 * hash.
 *
 * listening_get_idx() decrements the local @pos as it steps, so when the
 * listening side runs out the remaining offset is applied to the
 * established side.  Returns the element found or NULL.
 */
2062 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2063 {
2064         void *rc;
2065         struct tcp_iter_state *st = seq->private;
2066
2067         st->state = TCP_SEQ_STATE_LISTENING;
2068         rc        = listening_get_idx(seq, &pos);
2069
2070         if (!rc) {
2071                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2072                 rc        = established_get_idx(seq, pos);
2073         }
2074
2075         return rc;
2076 }
2077
/*
 * seq_file .start callback.  Resets the iterator state and entry counter,
 * then returns SEQ_START_TOKEN for offset 0 (tcp4_seq_show() prints the
 * header line for that token) or the element at index *pos - 1 otherwise.
 */
2078 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2079 {
2080         struct tcp_iter_state *st = seq->private;
2081         st->state = TCP_SEQ_STATE_LISTENING;
2082         st->num = 0;
2083         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2084 }
2085
/*
 * seq_file .next callback.  Returns the element following @v, or NULL at
 * the end of the dump, and always increments *pos.
 *
 * After the start token the first real element is fetched.  In the
 * listening/open-request states the listening iterator is advanced and,
 * when it runs dry, iteration continues with the established hash.  In the
 * established/timewait states the established iterator is advanced.
 */
2086 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2087 {
2088         void *rc = NULL;
2089         struct tcp_iter_state *st;
2090
2091         if (v == SEQ_START_TOKEN) {
2092                 rc = tcp_get_idx(seq, 0);
2093                 goto out;
2094         }
2095         st = seq->private;
2096
2097         switch (st->state) {
2098         case TCP_SEQ_STATE_OPENREQ:
2099         case TCP_SEQ_STATE_LISTENING:
2100                 rc = listening_get_next(seq, v);
2101                 if (!rc) {
                             /* Listening side exhausted: switch over to
                              * the established hash.
                              */
2102                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2103                         rc        = established_get_first(seq);
2104                 }
2105                 break;
2106         case TCP_SEQ_STATE_ESTABLISHED:
2107         case TCP_SEQ_STATE_TIME_WAIT:
2108                 rc = established_get_next(seq, v);
2109                 break;
2110         }
2111 out:
2112         ++*pos;
2113         return rc;
2114 }
2115
/*
 * seq_file .stop callback: release the locks the iterators left held for
 * the current element @v.
 *
 *  - OPENREQ: drop the syn_wait_lock of st->syn_wait_sk (when @v is
 *    non-NULL), then continue as for LISTENING.
 *  - LISTENING: drop the lock of listening bucket st->bucket unless @v is
 *    the start token.
 *  - TIME_WAIT / ESTABLISHED: drop the lock of ehash bucket st->bucket
 *    when @v is non-NULL.
 */
2116 static void tcp_seq_stop(struct seq_file *seq, void *v)
2117 {
2118         struct tcp_iter_state *st = seq->private;
2119
2120         switch (st->state) {
2121         case TCP_SEQ_STATE_OPENREQ:
2122                 if (v) {
2123                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2124                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2125                 }
                     /* fall through: the bucket lock is held as well */
2126         case TCP_SEQ_STATE_LISTENING:
2127                 if (v != SEQ_START_TOKEN)
2128                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2129                 break;
2130         case TCP_SEQ_STATE_TIME_WAIT:
2131         case TCP_SEQ_STATE_ESTABLISHED:
2132                 if (v)
2133                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2134                 break;
2135         }
2136 }
2137
/*
 * open() handler for the per-family /proc entry.
 *
 * Opens a namespace-aware seq_file using the seq_ops stored in the
 * tcp_seq_afinfo attached to the proc entry, with a struct tcp_iter_state
 * as private data, and records the address family to filter on.
 *
 * Returns 0 on success or the negative error from seq_open_net().
 */
2138 static int tcp_seq_open(struct inode *inode, struct file *file)
2139 {
2140         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2141         struct tcp_iter_state *s;
2142         int err;
2143
2144         err = seq_open_net(inode, file, &afinfo->seq_ops,
2145                           sizeof(struct tcp_iter_state));
2146         if (err < 0)
2147                 return err;
2148
2149         s = ((struct seq_file *)file->private_data)->private;
2150         s->family               = afinfo->family;
2151         return 0;
2152 }
2153
/*
 * Create the /proc entry described by @afinfo in network namespace @net.
 *
 * Fills in the generic file operations (open/read/llseek/release) and the
 * seq_file start/next/stop callbacks of @afinfo; the caller is expected to
 * have set the remaining fields (tcp4_seq_afinfo below provides name,
 * family, seq_fops.owner and seq_ops.show).  The entry is created read-only
 * (S_IRUGO) under net->proc_net with @afinfo as its data.
 *
 * Returns 0 on success, -ENOMEM if the entry could not be created.
 */
2154 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2155 {
2156         int rc = 0;
2157         struct proc_dir_entry *p;
2158
2159         afinfo->seq_fops.open           = tcp_seq_open;
2160         afinfo->seq_fops.read           = seq_read;
2161         afinfo->seq_fops.llseek         = seq_lseek;
2162         afinfo->seq_fops.release        = seq_release_net;
2163
2164         afinfo->seq_ops.start           = tcp_seq_start;
2165         afinfo->seq_ops.next            = tcp_seq_next;
2166         afinfo->seq_ops.stop            = tcp_seq_stop;
2167
2168         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2169                              &afinfo->seq_fops, afinfo);
2170         if (!p)
2171                 rc = -ENOMEM;
2172         return rc;
2173 }
2174
/*
 * Remove the /proc entry named afinfo->name from network namespace @net;
 * counterpart of tcp_proc_register().
 */
2175 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2176 {
2177         proc_net_remove(net, afinfo->name);
2178 }
2179
/*
 * Print one /proc line for a pending connection request.
 *
 * @sk:  listening socket the request belongs to (source of the local port
 *       and of the reference count column)
 * @req: the request being printed
 * @f:   seq_file to print into
 * @i:   entry number shown in the first column
 * @uid: value for the uid column
 * @len: receives, via the trailing %n, the number of characters produced;
 *       tcp4_seq_show() uses it to pad the line
 *
 * The state column is always TCP_SYN_RECV and the timer column is always 1.
 */
2180 static void get_openreq4(struct sock *sk, struct request_sock *req,
2181                          struct seq_file *f, int i, int uid, int *len)
2182 {
2183         const struct inet_request_sock *ireq = inet_rsk(req);
             /* Jiffies left until the request expires. */
2184         int ttd = req->expires - jiffies;
2185
2186         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2187                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2188                 i,
2189                 ireq->loc_addr,
2190                 ntohs(inet_sk(sk)->sport),
2191                 ireq->rmt_addr,
2192                 ntohs(ireq->rmt_port),
2193                 TCP_SYN_RECV,
2194                 0, 0, /* could print option size, but that is af dependent. */
2195                 1,    /* timers active (only the expire timer) */
2196                 jiffies_to_clock_t(ttd),
2197                 req->retrans,
2198                 uid,
2199                 0,  /* non standard timer */
2200                 0, /* open_requests have no inode */
2201                 atomic_read(&sk->sk_refcnt),
2202                 req,
2203                 len);
2204 }
2205
/*
 * Print one /proc line for a full TCP socket (listening or established).
 *
 * @sk:  socket to describe
 * @f:   seq_file to print into
 * @i:   entry number shown in the first column
 * @len: receives, via the trailing %n, the number of characters produced;
 *       tcp4_seq_show() uses it to pad the line
 *
 * The timer column is encoded as: 1 = retransmit timer pending, 4 = zero
 * window probe timer pending, 2 = sk_timer pending, 0 = none (in which case
 * the reported expiry is "now").
 */
2206 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2207 {
2208         int timer_active;
2209         unsigned long timer_expires;
2210         struct tcp_sock *tp = tcp_sk(sk);
2211         const struct inet_connection_sock *icsk = inet_csk(sk);
2212         struct inet_sock *inet = inet_sk(sk);
2213         __be32 dest = inet->daddr;
2214         __be32 src = inet->rcv_saddr;
2215         __u16 destp = ntohs(inet->dport);
2216         __u16 srcp = ntohs(inet->sport);
2217
2218         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2219                 timer_active    = 1;
2220                 timer_expires   = icsk->icsk_timeout;
2221         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2222                 timer_active    = 4;
2223                 timer_expires   = icsk->icsk_timeout;
2224         } else if (timer_pending(&sk->sk_timer)) {
2225                 timer_active    = 2;
2226                 timer_expires   = sk->sk_timer.expires;
2227         } else {
2228                 timer_active    = 0;
2229                 timer_expires = jiffies;
2230         }
2231
             /* The rx_queue column shows the accept backlog for listening
              * sockets and rcv_nxt - copied_seq otherwise; snd_ssthresh
              * values of 0xFFFF and above are printed as -1.
              */
2232         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2233                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2234                 i, src, srcp, dest, destp, sk->sk_state,
2235                 tp->write_seq - tp->snd_una,
2236                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2237                                              (tp->rcv_nxt - tp->copied_seq),
2238                 timer_active,
2239                 jiffies_to_clock_t(timer_expires - jiffies),
2240                 icsk->icsk_retransmits,
2241                 sock_i_uid(sk),
2242                 icsk->icsk_probes_out,
2243                 sock_i_ino(sk),
2244                 atomic_read(&sk->sk_refcnt), sk,
2245                 jiffies_to_clock_t(icsk->icsk_rto),
2246                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2247                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2248                 tp->snd_cwnd,
2249                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2250                 len);
2251 }
2252
/*
 * Print one /proc line for a timewait socket.
 *
 * @tw:  timewait socket to describe
 * @f:   seq_file to print into
 * @i:   entry number shown in the first column
 * @len: receives, via the trailing %n, the number of characters produced;
 *       tcp4_seq_show() uses it to pad the line
 *
 * The state column is tw->tw_substate, the timer column is always 3, and
 * the queue, retransmit, uid, timeout and inode columns are printed as 0.
 */
2253 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2254                                struct seq_file *f, int i, int *len)
2255 {
2256         __be32 dest, src;
2257         __u16 destp, srcp;
2258         int ttd = tw->tw_ttd - jiffies;
2259
             /* Never report a negative remaining time. */
2260         if (ttd < 0)
2261                 ttd = 0;
2262
2263         dest  = tw->tw_daddr;
2264         src   = tw->tw_rcv_saddr;
2265         destp = ntohs(tw->tw_dport);
2266         srcp  = ntohs(tw->tw_sport);
2267
2268         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2269                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2270                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2271                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2272                 atomic_read(&tw->tw_refcnt), tw, len);
2273 }
2274
/* Every line of the /proc output is padded to TMPSZ - 1 characters plus a
 * newline.
 */
2275 #define TMPSZ 150
2276
/*
 * seq_file .show callback for the IPv4 TCP /proc file.
 *
 * For the start token the column header is printed, left-justified and
 * padded to TMPSZ - 1 characters.  Otherwise the element @v is formatted by
 * the helper matching st->state, which reports the number of characters it
 * produced in len; the line is then padded with spaces to TMPSZ - 1
 * characters and terminated with a newline.
 *
 * Always returns 0.
 */
2277 static int tcp4_seq_show(struct seq_file *seq, void *v)
2278 {
2279         struct tcp_iter_state *st;
2280         int len;
2281
2282         if (v == SEQ_START_TOKEN) {
2283                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2284                            "  sl  local_address rem_address   st tx_queue "
2285                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2286                            "inode");
2287                 goto out;
2288         }
2289         st = seq->private;
2290
             /* len is only assigned by the helpers below, i.e. when
              * st->state is one of the four handled states.
              */
2291         switch (st->state) {
2292         case TCP_SEQ_STATE_LISTENING:
2293         case TCP_SEQ_STATE_ESTABLISHED:
2294                 get_tcp4_sock(v, seq, st->num, &len);
2295                 break;
2296         case TCP_SEQ_STATE_OPENREQ:
2297                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2298                 break;
2299         case TCP_SEQ_STATE_TIME_WAIT:
2300                 get_timewait4_sock(v, seq, st->num, &len);
2301                 break;
2302         }
             /* Pad the record out to the fixed line width. */
2303         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2304 out:
2305         return 0;
2306 }
2307
/*
 * Description of the IPv4 /proc entry ("tcp", AF_INET).  Only the owner and
 * the show callback are set here; tcp_proc_register() fills in the
 * remaining file and seq operations.
 */
2308 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2309         .name           = "tcp",
2310         .family         = AF_INET,
2311         .seq_fops       = {
2312                 .owner          = THIS_MODULE,
2313         },
2314         .seq_ops        = {
2315                 .show           = tcp4_seq_show,
2316         },
2317 };
2318
/*
 * Per-namespace init hook (tcp4_net_ops.init): register the IPv4 TCP /proc
 * entry in @net.  Returns the result of tcp_proc_register().
 */
2319 static int tcp4_proc_init_net(struct net *net)
2320 {
2321         return tcp_proc_register(net, &tcp4_seq_afinfo);
2322 }
2323
/*
 * Per-namespace exit hook (tcp4_net_ops.exit): remove the IPv4 TCP /proc
 * entry from @net.
 */
2324 static void tcp4_proc_exit_net(struct net *net)
2325 {
2326         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2327 }
2328
/*
 * Per-network-namespace operations that create and remove the IPv4 TCP
 * /proc entry; registered by tcp4_proc_init().
 */
2329 static struct pernet_operations tcp4_net_ops = {
2330         .init = tcp4_proc_init_net,
2331         .exit = tcp4_proc_exit_net,
2332 };
2333
/*
 * Register tcp4_net_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
2334 int __init tcp4_proc_init(void)
2335 {
2336         return register_pernet_subsys(&tcp4_net_ops);
2337 }
2338
/*
 * Unregister tcp4_net_ops; counterpart of tcp4_proc_init().
 */
2339 void tcp4_proc_exit(void)
2340 {
2341         unregister_pernet_subsys(&tcp4_net_ops);
2342 }
2343 #endif /* CONFIG_PROC_FS */
2344
/*
 * IPv4 front end for tcp_gro_receive().
 *
 * Checks skb->ip_summed before handing the packet on:
 *  - CHECKSUM_COMPLETE: if tcp_v4_check() over the GRO length, the IP
 *    addresses and skb->csum yields 0, the skb is marked
 *    CHECKSUM_UNNECESSARY and processing continues; otherwise it is treated
 *    like CHECKSUM_NONE.
 *  - CHECKSUM_NONE: the flush flag in the skb's GRO control block is set
 *    and NULL is returned without calling tcp_gro_receive().
 *  - any other value: processing continues.
 *
 * Returns whatever tcp_gro_receive() returns, or NULL as described above.
 */
2345 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2346 {
2347         struct iphdr *iph = skb_gro_network_header(skb);
2348
2349         switch (skb->ip_summed) {
2350         case CHECKSUM_COMPLETE:
2351                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2352                                   skb->csum)) {
2353                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2354                         break;
2355                 }
2356
2357                 /* fall through */
2358         case CHECKSUM_NONE:
2359                 NAPI_GRO_CB(skb)->flush = 1;
2360                 return NULL;
2361         }
2362
2363         return tcp_gro_receive(head, skb);
2364 }
2365 EXPORT_SYMBOL(tcp4_gro_receive);
2366
/*
 * IPv4 front end for tcp_gro_complete().
 *
 * Stores in th->check the complement of tcp_v4_check() computed over the
 * length from the transport header to the end of the skb, the IP source and
 * destination addresses and a zero initial sum (presumably the
 * pseudo-header checksum expected for later segmentation -- confirm
 * against tcp_v4_check()), marks the skb as SKB_GSO_TCPV4 and returns the
 * result of tcp_gro_complete().
 */
2367 int tcp4_gro_complete(struct sk_buff *skb)
2368 {
2369         struct iphdr *iph = ip_hdr(skb);
2370         struct tcphdr *th = tcp_hdr(skb);
2371
2372         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2373                                   iph->saddr, iph->daddr, 0);
2374         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2375
2376         return tcp_gro_complete(skb);
2377 }
2378 EXPORT_SYMBOL(tcp4_gro_complete);
2379
/*
 * Protocol descriptor for TCP over IPv4 (exported at the end of the file).
 *
 * Wires the generic socket operations to their TCP implementations; the
 * .init and .destroy hooks are tcp_v4_init_sock() and tcp_v4_destroy_sock()
 * from this file.  It also points at the shared TCP accounting variables,
 * sysctl tables, the timewait and request socket operations and
 * tcp_hashinfo.  The compat_* entries exist only when CONFIG_COMPAT is set.
 */
2380 struct proto tcp_prot = {
2381         .name                   = "TCP",
2382         .owner                  = THIS_MODULE,
2383         .close                  = tcp_close,
2384         .connect                = tcp_v4_connect,
2385         .disconnect             = tcp_disconnect,
2386         .accept                 = inet_csk_accept,
2387         .ioctl                  = tcp_ioctl,
2388         .init                   = tcp_v4_init_sock,
2389         .destroy                = tcp_v4_destroy_sock,
2390         .shutdown               = tcp_shutdown,
2391         .setsockopt             = tcp_setsockopt,
2392         .getsockopt             = tcp_getsockopt,
2393         .recvmsg                = tcp_recvmsg,
2394         .backlog_rcv            = tcp_v4_do_rcv,
2395         .hash                   = inet_hash,
2396         .unhash                 = inet_unhash,
2397         .get_port               = inet_csk_get_port,
2398         .enter_memory_pressure  = tcp_enter_memory_pressure,
2399         .sockets_allocated      = &tcp_sockets_allocated,
2400         .orphan_count           = &tcp_orphan_count,
2401         .memory_allocated       = &tcp_memory_allocated,
2402         .memory_pressure        = &tcp_memory_pressure,
2403         .sysctl_mem             = sysctl_tcp_mem,
2404         .sysctl_wmem            = sysctl_tcp_wmem,
2405         .sysctl_rmem            = sysctl_tcp_rmem,
2406         .max_header             = MAX_TCP_HEADER,
2407         .obj_size               = sizeof(struct tcp_sock),
2408         .slab_flags             = SLAB_DESTROY_BY_RCU,
2409         .twsk_prot              = &tcp_timewait_sock_ops,
2410         .rsk_prot               = &tcp_request_sock_ops,
2411         .h.hashinfo             = &tcp_hashinfo,
2412 #ifdef CONFIG_COMPAT
2413         .compat_setsockopt      = compat_tcp_setsockopt,
2414         .compat_getsockopt      = compat_tcp_getsockopt,
2415 #endif
2416 };
2417
2418
/*
 * Per-namespace init hook (tcp_sk_ops.init): create the namespace's TCP
 * control socket, net->ipv4.tcp_sock, as a PF_INET / SOCK_RAW / IPPROTO_TCP
 * socket.  Returns the result of inet_ctl_sock_create().
 */
2419 static int __net_init tcp_sk_init(struct net *net)
2420 {
2421         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2422                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2423 }
2424
/*
 * Per-namespace exit hook (tcp_sk_ops.exit): destroy the namespace's TCP
 * control socket and purge the AF_INET timewait sockets of @net from
 * tcp_hashinfo / tcp_death_row.
 */
2425 static void __net_exit tcp_sk_exit(struct net *net)
2426 {
2427         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2428         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2429 }
2430
/*
 * Per-network-namespace operations managing the TCP control socket;
 * registered by tcp_v4_init().
 */
2431 static struct pernet_operations __net_initdata tcp_sk_ops = {
2432        .init = tcp_sk_init,
2433        .exit = tcp_sk_exit,
2434 };
2435
/*
 * Boot-time initialisation of TCP/IPv4: set up tcp_hashinfo and register
 * the per-namespace control-socket operations.  Failure to register is
 * fatal (panic).
 */
2436 void __init tcp_v4_init(void)
2437 {
2438         inet_hashinfo_init(&tcp_hashinfo);
2439         if (register_pernet_subsys(&tcp_sk_ops))
2440                 panic("Failed to create the TCP control socket.\n");
2441 }
2442
/*
 * Symbols exported for use outside this file.  The /proc registration
 * helpers are exported only when CONFIG_PROC_FS is enabled.
 */
2443 EXPORT_SYMBOL(ipv4_specific);
2444 EXPORT_SYMBOL(tcp_hashinfo);
2445 EXPORT_SYMBOL(tcp_prot);
2446 EXPORT_SYMBOL(tcp_v4_conn_request);
2447 EXPORT_SYMBOL(tcp_v4_connect);
2448 EXPORT_SYMBOL(tcp_v4_do_rcv);
2449 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2450 EXPORT_SYMBOL(tcp_v4_send_check);
2451 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2452
2453 #ifdef CONFIG_PROC_FS
2454 EXPORT_SYMBOL(tcp_proc_register);
2455 EXPORT_SYMBOL(tcp_proc_unregister);
2456 #endif
2457 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2458