[REQSK]: Move the syn_table destroy from tcp_listen_stop to reqsk_queue_destroy
[pandora-kernel.git] net/ipv4/tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed where wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non atomically.
296  * All the sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317                                                poll_table *wait)
318 {
319         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321
322 /*
323  *      Wait for a TCP event.
324  *
325  *      Note that we don't need to lock the socket, as the upper poll layers
326  *      take care of normal races (between the test and the event) and we don't
327  *      go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331         unsigned int mask;
332         struct sock *sk = sock->sk;
333         struct tcp_sock *tp = tcp_sk(sk);
334
335         poll_wait(file, sk->sk_sleep, wait);
336         if (sk->sk_state == TCP_LISTEN)
337                 return tcp_listen_poll(sk, wait);
338
339         /* Socket is not locked. We are protected from async events
340            by poll logic, and correct handling of state changes
341            made by other threads is impossible in any case.
342          */
343
344         mask = 0;
345         if (sk->sk_err)
346                 mask = POLLERR;
347
348         /*
349          * POLLHUP is certainly not done right. But poll() doesn't
350          * have a notion of HUP in just one direction, and for a
351          * socket the read side is more interesting.
352          *
353          * Some poll() documentation says that POLLHUP is incompatible
354          * with the POLLOUT/POLLWR flags, so somebody should check this
355          * all. But careful, it tends to be safer to return too many
356          * bits than too few, and you can easily break real applications
357          * if you don't tell them that something has hung up!
358          *
359          * Check-me.
360          *
361          * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
362          * our fs/select.c). It means that after we received EOF,
363          * poll always returns immediately, making poll() on write() in
364          * state CLOSE_WAIT impossible. One solution is evident --- to set POLLHUP
365          * if and only if shutdown has been made in both directions.
366          * Actually, it is interesting to look at how Solaris and DUX
367          * solve this dilemma. I would prefer, if POLLHUP were maskable,
368          * then we could set it on SND_SHUTDOWN. BTW examples given
369          * in Stevens' books assume exactly this behaviour; it explains
370          * why POLLHUP is incompatible with POLLOUT.    --ANK
371          *
372          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373          * blocking on fresh not-connected or disconnected socket. --ANK
374          */
375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376                 mask |= POLLHUP;
377         if (sk->sk_shutdown & RCV_SHUTDOWN)
378                 mask |= POLLIN | POLLRDNORM;
379
380         /* Connected? */
381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382                 /* Potential race condition. If the read of tp below
383                  * escapes above sk->sk_state, we can be illegally
384                  * awakened in SYN_* states. */
385                 if ((tp->rcv_nxt != tp->copied_seq) &&
386                     (tp->urg_seq != tp->copied_seq ||
387                      tp->rcv_nxt != tp->copied_seq + 1 ||
388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389                         mask |= POLLIN | POLLRDNORM;
390
391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393                                 mask |= POLLOUT | POLLWRNORM;
394                         } else {  /* send SIGIO later */
395                                 set_bit(SOCK_ASYNC_NOSPACE,
396                                         &sk->sk_socket->flags);
397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399                                 /* Race breaker. If space is freed after
400                                  * wspace test but before the flags are set,
401                                  * IO signal will be lost.
402                                  */
403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404                                         mask |= POLLOUT | POLLWRNORM;
405                         }
406                 }
407
408                 if (tp->urg_data & TCP_URG_VALID)
409                         mask |= POLLPRI;
410         }
411         return mask;
412 }
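/*
 * A minimal userspace-side sketch (assuming a connected TCP socket "fd"):
 * as the comment above notes, POLLHUP is only reported once shutdown has
 * happened in both directions (or the socket has reached TCP_CLOSE), so
 * readers should also watch for it rather than spinning on EOF.
 */
#if 0
#include <poll.h>

static int wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	if (poll(&pfd, 1, timeout_ms) <= 0)
		return 0;	/* timeout or error */
	return pfd.revents & (POLLIN | POLLPRI | POLLHUP | POLLERR);
}
#endif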
413
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416         struct tcp_sock *tp = tcp_sk(sk);
417         int answ;
418
419         switch (cmd) {
420         case SIOCINQ:
421                 if (sk->sk_state == TCP_LISTEN)
422                         return -EINVAL;
423
424                 lock_sock(sk);
425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426                         answ = 0;
427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
428                          !tp->urg_data ||
429                          before(tp->urg_seq, tp->copied_seq) ||
430                          !before(tp->urg_seq, tp->rcv_nxt)) {
431                         answ = tp->rcv_nxt - tp->copied_seq;
432
433                         /* Subtract 1, if FIN is in queue. */
434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435                                 answ -=
436                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437                 } else
438                         answ = tp->urg_seq - tp->copied_seq;
439                 release_sock(sk);
440                 break;
441         case SIOCATMARK:
442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443                 break;
444         case SIOCOUTQ:
445                 if (sk->sk_state == TCP_LISTEN)
446                         return -EINVAL;
447
448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449                         answ = 0;
450                 else
451                         answ = tp->write_seq - tp->snd_una;
452                 break;
453         default:
454                 return -ENOIOCTLCMD;
455         };
456
457         return put_user(answ, (int __user *)arg);
458 }
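/*
 * A minimal userspace-side sketch of the two queue ioctls handled above
 * (assuming a connected TCP socket "fd"): SIOCINQ reports bytes waiting to
 * be read, SIOCOUTQ reports bytes written but not yet acknowledged.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void report_queues(int fd)
{
	int inq = 0, outq = 0;

	if (!ioctl(fd, SIOCINQ, &inq) && !ioctl(fd, SIOCOUTQ, &outq))
		printf("readable: %d bytes, unacked: %d bytes\n", inq, outq);
}
#endif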
459
460
461 int tcp_listen_start(struct sock *sk)
462 {
463         struct inet_sock *inet = inet_sk(sk);
464         struct tcp_sock *tp = tcp_sk(sk);
465         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467         if (rc != 0)
468                 return rc;
469
470         sk->sk_max_ack_backlog = 0;
471         sk->sk_ack_backlog = 0;
472         tcp_delack_init(tp);
473
474         /* There is a race window here: we announce ourselves as listening,
475          * but this transition is still not validated by get_port().
476          * It is OK, because this socket enters the hash table only
477          * after validation is complete.
478          */
479         sk->sk_state = TCP_LISTEN;
480         if (!sk->sk_prot->get_port(sk, inet->num)) {
481                 inet->sport = htons(inet->num);
482
483                 sk_dst_reset(sk);
484                 sk->sk_prot->hash(sk);
485
486                 return 0;
487         }
488
489         sk->sk_state = TCP_CLOSE;
490         __reqsk_queue_destroy(&tp->accept_queue);
491         return -EADDRINUSE;
492 }
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498
499 static void tcp_listen_stop (struct sock *sk)
500 {
501         struct tcp_sock *tp = tcp_sk(sk);
502         struct request_sock *acc_req;
503         struct request_sock *req;
504
505         tcp_delete_keepalive_timer(sk);
506
507         /* make the pending accept queue local to us */
508         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
509
510         /* Following the specs, it would be better either to send FIN
511          * (and enter FIN-WAIT-1, i.e. a normal close)
512          * or to send an active reset (abort).
513          * Certainly, it is pretty dangerous during a synflood, but that
514          * is a bad justification for our negligence 8)
515          * To be honest, we are not able to implement either
516          * of the variants now.                 --ANK
517          */
518         reqsk_queue_destroy(&tp->accept_queue);
519
520         while ((req = acc_req) != NULL) {
521                 struct sock *child = req->sk;
522
523                 acc_req = req->dl_next;
524
525                 local_bh_disable();
526                 bh_lock_sock(child);
527                 BUG_TRAP(!sock_owned_by_user(child));
528                 sock_hold(child);
529
530                 tcp_disconnect(child, O_NONBLOCK);
531
532                 sock_orphan(child);
533
534                 atomic_inc(&tcp_orphan_count);
535
536                 tcp_destroy_sock(child);
537
538                 bh_unlock_sock(child);
539                 local_bh_enable();
540                 sock_put(child);
541
542                 sk_acceptq_removed(sk);
543                 __reqsk_free(req);
544         }
545         BUG_TRAP(!sk->sk_ack_backlog);
546 }
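/*
 * A sketch (under the assumption stated in the patch title) of how
 * reqsk_queue_destroy() in net/core/request_sock.c is expected to pick up
 * the syn_table teardown that used to live here: walk the hash table, free
 * every request that never reached the accept queue, then free the
 * listen_sock itself.
 */
#if 0
void reqsk_queue_destroy(struct request_sock_queue *queue)
{
	/* Make the listen_sock private to us before draining its syn_table. */
	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);

	if (lopt->qlen != 0) {
		int i;

		for (i = 0; i < lopt->nr_table_entries; i++) {
			struct request_sock *req;

			while ((req = lopt->syn_table[i]) != NULL) {
				lopt->syn_table[i] = req->dl_next;
				lopt->qlen--;
				reqsk_free(req);
			}
		}
	}

	BUG_TRAP(lopt->qlen == 0);
	kfree(lopt);
}
#endif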
547
548 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
549 {
550         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
551         tp->pushed_seq = tp->write_seq;
552 }
553
554 static inline int forced_push(struct tcp_sock *tp)
555 {
556         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
557 }
558
559 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
560                               struct sk_buff *skb)
561 {
562         skb->csum = 0;
563         TCP_SKB_CB(skb)->seq = tp->write_seq;
564         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
565         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
566         TCP_SKB_CB(skb)->sacked = 0;
567         skb_header_release(skb);
568         __skb_queue_tail(&sk->sk_write_queue, skb);
569         sk_charge_skb(sk, skb);
570         if (!sk->sk_send_head)
571                 sk->sk_send_head = skb;
572         if (tp->nonagle & TCP_NAGLE_PUSH)
573                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
574 }
575
576 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
577                                 struct sk_buff *skb)
578 {
579         if (flags & MSG_OOB) {
580                 tp->urg_mode = 1;
581                 tp->snd_up = tp->write_seq;
582                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
583         }
584 }
585
586 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
587                             int mss_now, int nonagle)
588 {
589         if (sk->sk_send_head) {
590                 struct sk_buff *skb = sk->sk_write_queue.prev;
591                 if (!(flags & MSG_MORE) || forced_push(tp))
592                         tcp_mark_push(tp, skb);
593                 tcp_mark_urg(tp, flags, skb);
594                 __tcp_push_pending_frames(sk, tp, mss_now,
595                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
596         }
597 }
598
599 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
600                          size_t psize, int flags)
601 {
602         struct tcp_sock *tp = tcp_sk(sk);
603         int mss_now, size_goal;
604         int err;
605         ssize_t copied;
606         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
607
608         /* Wait for a connection to finish. */
609         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
610                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
611                         goto out_err;
612
613         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
614
615         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
616         size_goal = tp->xmit_size_goal;
617         copied = 0;
618
619         err = -EPIPE;
620         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
621                 goto do_error;
622
623         while (psize > 0) {
624                 struct sk_buff *skb = sk->sk_write_queue.prev;
625                 struct page *page = pages[poffset / PAGE_SIZE];
626                 int copy, i, can_coalesce;
627                 int offset = poffset % PAGE_SIZE;
628                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
629
630                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
631 new_segment:
632                         if (!sk_stream_memory_free(sk))
633                                 goto wait_for_sndbuf;
634
635                         skb = sk_stream_alloc_pskb(sk, 0, 0,
636                                                    sk->sk_allocation);
637                         if (!skb)
638                                 goto wait_for_memory;
639
640                         skb_entail(sk, tp, skb);
641                         copy = size_goal;
642                 }
643
644                 if (copy > size)
645                         copy = size;
646
647                 i = skb_shinfo(skb)->nr_frags;
648                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
649                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
650                         tcp_mark_push(tp, skb);
651                         goto new_segment;
652                 }
653                 if (sk->sk_forward_alloc < copy &&
654                     !sk_stream_mem_schedule(sk, copy, 0))
655                         goto wait_for_memory;
656                 
657                 if (can_coalesce) {
658                         skb_shinfo(skb)->frags[i - 1].size += copy;
659                 } else {
660                         get_page(page);
661                         skb_fill_page_desc(skb, i, page, offset, copy);
662                 }
663
664                 skb->len += copy;
665                 skb->data_len += copy;
666                 skb->truesize += copy;
667                 sk->sk_wmem_queued += copy;
668                 sk->sk_forward_alloc -= copy;
669                 skb->ip_summed = CHECKSUM_HW;
670                 tp->write_seq += copy;
671                 TCP_SKB_CB(skb)->end_seq += copy;
672                 skb_shinfo(skb)->tso_segs = 0;
673
674                 if (!copied)
675                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
676
677                 copied += copy;
678                 poffset += copy;
679                 if (!(psize -= copy))
680                         goto out;
681
682                 if (skb->len < mss_now || (flags & MSG_OOB))
683                         continue;
684
685                 if (forced_push(tp)) {
686                         tcp_mark_push(tp, skb);
687                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
688                 } else if (skb == sk->sk_send_head)
689                         tcp_push_one(sk, mss_now);
690                 continue;
691
692 wait_for_sndbuf:
693                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
694 wait_for_memory:
695                 if (copied)
696                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
697
698                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
699                         goto do_error;
700
701                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
702                 size_goal = tp->xmit_size_goal;
703         }
704
705 out:
706         if (copied)
707                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
708         return copied;
709
710 do_error:
711         if (copied)
712                 goto out;
713 out_err:
714         return sk_stream_error(sk, flags, err);
715 }
716
717 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
718                      size_t size, int flags)
719 {
720         ssize_t res;
721         struct sock *sk = sock->sk;
722
723 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
724
725         if (!(sk->sk_route_caps & NETIF_F_SG) ||
726             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
727                 return sock_no_sendpage(sock, page, offset, size, flags);
728
729 #undef TCP_ZC_CSUM_FLAGS
730
731         lock_sock(sk);
732         TCP_CHECK_TIMER(sk);
733         res = do_tcp_sendpages(sk, &page, offset, size, flags);
734         TCP_CHECK_TIMER(sk);
735         release_sock(sk);
736         return res;
737 }
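/*
 * A minimal userspace-side sketch: sendfile(2) on a TCP socket ends up in
 * do_tcp_sendpages() above when the route supports scatter/gather and
 * hardware checksumming; otherwise tcp_sendpage() falls back to
 * sock_no_sendpage().
 */
#if 0
#include <sys/sendfile.h>

static ssize_t send_file_over_tcp(int sock_fd, int file_fd, size_t len)
{
	off_t off = 0;

	return sendfile(sock_fd, file_fd, &off, len);
}
#endif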
738
739 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
740 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
741
742 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
743 {
744         int tmp = tp->mss_cache;
745
746         if (sk->sk_route_caps & NETIF_F_SG) {
747                 if (sk->sk_route_caps & NETIF_F_TSO)
748                         tmp = 0;
749                 else {
750                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
751
752                         if (tmp >= pgbreak &&
753                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
754                                 tmp = pgbreak;
755                 }
756         }
757
758         return tmp;
759 }
760
761 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
762                 size_t size)
763 {
764         struct iovec *iov;
765         struct tcp_sock *tp = tcp_sk(sk);
766         struct sk_buff *skb;
767         int iovlen, flags;
768         int mss_now, size_goal;
769         int err, copied;
770         long timeo;
771
772         lock_sock(sk);
773         TCP_CHECK_TIMER(sk);
774
775         flags = msg->msg_flags;
776         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
777
778         /* Wait for a connection to finish. */
779         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
780                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
781                         goto out_err;
782
783         /* This should be in poll */
784         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
785
786         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
787         size_goal = tp->xmit_size_goal;
788
789         /* Ok commence sending. */
790         iovlen = msg->msg_iovlen;
791         iov = msg->msg_iov;
792         copied = 0;
793
794         err = -EPIPE;
795         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
796                 goto do_error;
797
798         while (--iovlen >= 0) {
799                 int seglen = iov->iov_len;
800                 unsigned char __user *from = iov->iov_base;
801
802                 iov++;
803
804                 while (seglen > 0) {
805                         int copy;
806
807                         skb = sk->sk_write_queue.prev;
808
809                         if (!sk->sk_send_head ||
810                             (copy = size_goal - skb->len) <= 0) {
811
812 new_segment:
813                                 /* Allocate new segment. If the interface is SG,
814                                  * allocate skb fitting to single page.
815                                  */
816                                 if (!sk_stream_memory_free(sk))
817                                         goto wait_for_sndbuf;
818
819                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
820                                                            0, sk->sk_allocation);
821                                 if (!skb)
822                                         goto wait_for_memory;
823
824                                 /*
825                                  * Check whether we can use HW checksum.
826                                  */
827                                 if (sk->sk_route_caps &
828                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
829                                      NETIF_F_HW_CSUM))
830                                         skb->ip_summed = CHECKSUM_HW;
831
832                                 skb_entail(sk, tp, skb);
833                                 copy = size_goal;
834                         }
835
836                         /* Try to append data to the end of skb. */
837                         if (copy > seglen)
838                                 copy = seglen;
839
840                         /* Where to copy to? */
841                         if (skb_tailroom(skb) > 0) {
842                                 /* We have some space in skb head. Superb! */
843                                 if (copy > skb_tailroom(skb))
844                                         copy = skb_tailroom(skb);
845                                 if ((err = skb_add_data(skb, from, copy)) != 0)
846                                         goto do_fault;
847                         } else {
848                                 int merge = 0;
849                                 int i = skb_shinfo(skb)->nr_frags;
850                                 struct page *page = TCP_PAGE(sk);
851                                 int off = TCP_OFF(sk);
852
853                                 if (skb_can_coalesce(skb, i, page, off) &&
854                                     off != PAGE_SIZE) {
855                                         /* We can extend the last page
856                                          * fragment. */
857                                         merge = 1;
858                                 } else if (i == MAX_SKB_FRAGS ||
859                                            (!i &&
860                                            !(sk->sk_route_caps & NETIF_F_SG))) {
861                                         /* Need to add new fragment and cannot
862                                          * do this because interface is non-SG,
863                                          * or because all the page slots are
864                                          * busy. */
865                                         tcp_mark_push(tp, skb);
866                                         goto new_segment;
867                                 } else if (page) {
868                                         if (off == PAGE_SIZE) {
869                                                 put_page(page);
870                                                 TCP_PAGE(sk) = page = NULL;
871                                         }
872                                 }
873
874                                 if (!page) {
875                                         /* Allocate new cache page. */
876                                         if (!(page = sk_stream_alloc_page(sk)))
877                                                 goto wait_for_memory;
878                                         off = 0;
879                                 }
880
881                                 if (copy > PAGE_SIZE - off)
882                                         copy = PAGE_SIZE - off;
883
884                                 /* Time to copy data. We are close to
885                                  * the end! */
886                                 err = skb_copy_to_page(sk, from, skb, page,
887                                                        off, copy);
888                                 if (err) {
889                                         /* If this page was new, give it to the
890                                          * socket so it does not get leaked.
891                                          */
892                                         if (!TCP_PAGE(sk)) {
893                                                 TCP_PAGE(sk) = page;
894                                                 TCP_OFF(sk) = 0;
895                                         }
896                                         goto do_error;
897                                 }
898
899                                 /* Update the skb. */
900                                 if (merge) {
901                                         skb_shinfo(skb)->frags[i - 1].size +=
902                                                                         copy;
903                                 } else {
904                                         skb_fill_page_desc(skb, i, page, off, copy);
905                                         if (TCP_PAGE(sk)) {
906                                                 get_page(page);
907                                         } else if (off + copy < PAGE_SIZE) {
908                                                 get_page(page);
909                                                 TCP_PAGE(sk) = page;
910                                         }
911                                 }
912
913                                 TCP_OFF(sk) = off + copy;
914                         }
915
916                         if (!copied)
917                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
918
919                         tp->write_seq += copy;
920                         TCP_SKB_CB(skb)->end_seq += copy;
921                         skb_shinfo(skb)->tso_segs = 0;
922
923                         from += copy;
924                         copied += copy;
925                         if ((seglen -= copy) == 0 && iovlen == 0)
926                                 goto out;
927
928                         if (skb->len < mss_now || (flags & MSG_OOB))
929                                 continue;
930
931                         if (forced_push(tp)) {
932                                 tcp_mark_push(tp, skb);
933                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
934                         } else if (skb == sk->sk_send_head)
935                                 tcp_push_one(sk, mss_now);
936                         continue;
937
938 wait_for_sndbuf:
939                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
940 wait_for_memory:
941                         if (copied)
942                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
943
944                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
945                                 goto do_error;
946
947                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
948                         size_goal = tp->xmit_size_goal;
949                 }
950         }
951
952 out:
953         if (copied)
954                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
955         TCP_CHECK_TIMER(sk);
956         release_sock(sk);
957         return copied;
958
959 do_fault:
960         if (!skb->len) {
961                 if (sk->sk_send_head == skb)
962                         sk->sk_send_head = NULL;
963                 __skb_unlink(skb, &sk->sk_write_queue);
964                 sk_stream_free_skb(sk, skb);
965         }
966
967 do_error:
968         if (copied)
969                 goto out;
970 out_err:
971         err = sk_stream_error(sk, flags, err);
972         TCP_CHECK_TIMER(sk);
973         release_sock(sk);
974         return err;
975 }
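/*
 * A minimal userspace-side sketch: MSG_MORE maps to the TCP_NAGLE_CORK
 * argument that tcp_push() passes to __tcp_push_pending_frames() above, so
 * a small header and its body queued by separate send() calls can leave in
 * one segment.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

static void send_header_then_body(int fd, const void *hdr, size_t hlen,
				  const void *body, size_t blen)
{
	send(fd, hdr, hlen, MSG_MORE);	/* hold back: more data follows */
	send(fd, body, blen, 0);	/* push everything queued so far */
}
#endif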
976
977 /*
978  *      Handle reading urgent data. BSD has very simple semantics for
979  *      this, no blocking and very strange errors 8)
980  */
981
982 static int tcp_recv_urg(struct sock *sk, long timeo,
983                         struct msghdr *msg, int len, int flags,
984                         int *addr_len)
985 {
986         struct tcp_sock *tp = tcp_sk(sk);
987
988         /* No URG data to read. */
989         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
990             tp->urg_data == TCP_URG_READ)
991                 return -EINVAL; /* Yes this is right ! */
992
993         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
994                 return -ENOTCONN;
995
996         if (tp->urg_data & TCP_URG_VALID) {
997                 int err = 0;
998                 char c = tp->urg_data;
999
1000                 if (!(flags & MSG_PEEK))
1001                         tp->urg_data = TCP_URG_READ;
1002
1003                 /* Read urgent data. */
1004                 msg->msg_flags |= MSG_OOB;
1005
1006                 if (len > 0) {
1007                         if (!(flags & MSG_TRUNC))
1008                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1009                         len = 1;
1010                 } else
1011                         msg->msg_flags |= MSG_TRUNC;
1012
1013                 return err ? -EFAULT : len;
1014         }
1015
1016         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1017                 return 0;
1018
1019         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1020          * the available implementations agree in this case:
1021          * this call should never block, independent of the
1022          * blocking state of the socket.
1023          * Mike <pall@rz.uni-karlsruhe.de>
1024          */
1025         return -EAGAIN;
1026 }
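/*
 * A minimal userspace-side sketch of the semantics implemented above:
 * recv(..., MSG_OOB) never blocks; it fails with EINVAL if SO_OOBINLINE is
 * set or the urgent byte was already consumed, with EAGAIN if none has
 * arrived yet, and returns 0 once the connection is shut down.
 */
#if 0
#include <sys/socket.h>

static ssize_t read_oob_byte(int fd, char *out)
{
	/* Returns 1 on success, 0 after shutdown, -1 with errno otherwise. */
	return recv(fd, out, 1, MSG_OOB);
}
#endif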
1027
1028 /* Clean up the receive buffer for full frames taken by the user,
1029  * then send an ACK if necessary.  COPIED is the number of bytes
1030  * tcp_recvmsg has given to the user so far, it speeds up the
1031  * calculation of whether or not we must ACK for the sake of
1032  * a window update.
1033  */
1034 static void cleanup_rbuf(struct sock *sk, int copied)
1035 {
1036         struct tcp_sock *tp = tcp_sk(sk);
1037         int time_to_ack = 0;
1038
1039 #if TCP_DEBUG
1040         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1041
1042         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1043 #endif
1044
1045         if (tcp_ack_scheduled(tp)) {
1046                 /* Delayed ACKs frequently hit locked sockets during bulk
1047                  * receive. */
1048                 if (tp->ack.blocked ||
1049                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1050                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1051                     /*
1052                      * If this read emptied read buffer, we send ACK, if
1053                      * connection is not bidirectional, user drained
1054                      * receive buffer and there was a small segment
1055                      * in queue.
1056                      */
1057                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1058                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1059                         time_to_ack = 1;
1060         }
1061
1062         /* We send an ACK if we can now advertise a non-zero window
1063          * which has been raised "significantly".
1064          *
1065          * Even if window raised up to infinity, do not send window open ACK
1066          * in states, where we will not receive more. It is useless.
1067          */
1068         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1069                 __u32 rcv_window_now = tcp_receive_window(tp);
1070
1071                 /* Optimize, __tcp_select_window() is not cheap. */
1072                 if (2*rcv_window_now <= tp->window_clamp) {
1073                         __u32 new_window = __tcp_select_window(sk);
1074
1075                         /* Send ACK now, if this read freed lots of space
1076                          * in our buffer. We can advertise the new window
1077                          * now, if it is not less than the current one.
1078                          * "Lots" means "at least twice" here.
1079                          */
1080                         if (new_window && new_window >= 2 * rcv_window_now)
1081                                 time_to_ack = 1;
1082                 }
1083         }
1084         if (time_to_ack)
1085                 tcp_send_ack(sk);
1086 }
1087
1088 static void tcp_prequeue_process(struct sock *sk)
1089 {
1090         struct sk_buff *skb;
1091         struct tcp_sock *tp = tcp_sk(sk);
1092
1093         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1094
1095         /* RX process wants to run with disabled BHs, though it is not
1096          * necessary */
1097         local_bh_disable();
1098         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1099                 sk->sk_backlog_rcv(sk, skb);
1100         local_bh_enable();
1101
1102         /* Clear memory counter. */
1103         tp->ucopy.memory = 0;
1104 }
1105
1106 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1107 {
1108         struct sk_buff *skb;
1109         u32 offset;
1110
1111         skb_queue_walk(&sk->sk_receive_queue, skb) {
1112                 offset = seq - TCP_SKB_CB(skb)->seq;
1113                 if (skb->h.th->syn)
1114                         offset--;
1115                 if (offset < skb->len || skb->h.th->fin) {
1116                         *off = offset;
1117                         return skb;
1118                 }
1119         }
1120         return NULL;
1121 }
1122
1123 /*
1124  * This routine provides an alternative to tcp_recvmsg() for routines
1125  * that would like to handle copying from skbuffs directly in 'sendfile'
1126  * fashion.
1127  * Note:
1128  *      - It is assumed that the socket was locked by the caller.
1129  *      - The routine does not block.
1130  *      - At present, there is no support for reading OOB data
1131  *        or for 'peeking' the socket using this routine
1132  *        (although both would be easy to implement).
1133  */
1134 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1135                   sk_read_actor_t recv_actor)
1136 {
1137         struct sk_buff *skb;
1138         struct tcp_sock *tp = tcp_sk(sk);
1139         u32 seq = tp->copied_seq;
1140         u32 offset;
1141         int copied = 0;
1142
1143         if (sk->sk_state == TCP_LISTEN)
1144                 return -ENOTCONN;
1145         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1146                 if (offset < skb->len) {
1147                         size_t used, len;
1148
1149                         len = skb->len - offset;
1150                         /* Stop reading if we hit a patch of urgent data */
1151                         if (tp->urg_data) {
1152                                 u32 urg_offset = tp->urg_seq - seq;
1153                                 if (urg_offset < len)
1154                                         len = urg_offset;
1155                                 if (!len)
1156                                         break;
1157                         }
1158                         used = recv_actor(desc, skb, offset, len);
1159                         if (used <= len) {
1160                                 seq += used;
1161                                 copied += used;
1162                                 offset += used;
1163                         }
1164                         if (offset != skb->len)
1165                                 break;
1166                 }
1167                 if (skb->h.th->fin) {
1168                         sk_eat_skb(sk, skb);
1169                         ++seq;
1170                         break;
1171                 }
1172                 sk_eat_skb(sk, skb);
1173                 if (!desc->count)
1174                         break;
1175         }
1176         tp->copied_seq = seq;
1177
1178         tcp_rcv_space_adjust(sk);
1179
1180         /* Clean up data we have read: This will do ACK frames. */
1181         if (copied)
1182                 cleanup_rbuf(sk, copied);
1183         return copied;
1184 }
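/*
 * A minimal in-kernel sketch of the "sendfile fashion" use described above.
 * example_recv_actor and the buffer carried in desc->arg.data are made-up
 * names for illustration; real callers supply their own actor and state.
 */
#if 0
static int example_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
			      unsigned int offset, size_t len)
{
	char *to = desc->arg.data;
	size_t want = min(len, desc->count);

	/* skb_copy_bits() handles both linear and paged skb data. */
	if (skb_copy_bits(skb, offset, to, want))
		return 0;

	desc->arg.data = to + want;
	desc->count -= want;
	return want;
}

/* Caller sketch, with the socket already locked as tcp_read_sock() expects:
 *
 *	read_descriptor_t desc = { .arg.data = buf, .count = buflen };
 *	copied = tcp_read_sock(sk, &desc, example_recv_actor);
 */
#endif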
1185
1186 /*
1187  *      This routine copies from a sock struct into the user buffer.
1188  *
1189  *      Technical note: in 2.3 we work on _locked_ socket, so that
1190  *      tricks with *seq access order and skb->users are not required.
1191  *      Probably, code can be easily improved even more.
1192  */
1193
1194 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1195                 size_t len, int nonblock, int flags, int *addr_len)
1196 {
1197         struct tcp_sock *tp = tcp_sk(sk);
1198         int copied = 0;
1199         u32 peek_seq;
1200         u32 *seq;
1201         unsigned long used;
1202         int err;
1203         int target;             /* Read at least this many bytes */
1204         long timeo;
1205         struct task_struct *user_recv = NULL;
1206
1207         lock_sock(sk);
1208
1209         TCP_CHECK_TIMER(sk);
1210
1211         err = -ENOTCONN;
1212         if (sk->sk_state == TCP_LISTEN)
1213                 goto out;
1214
1215         timeo = sock_rcvtimeo(sk, nonblock);
1216
1217         /* Urgent data needs to be handled specially. */
1218         if (flags & MSG_OOB)
1219                 goto recv_urg;
1220
1221         seq = &tp->copied_seq;
1222         if (flags & MSG_PEEK) {
1223                 peek_seq = tp->copied_seq;
1224                 seq = &peek_seq;
1225         }
1226
1227         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1228
1229         do {
1230                 struct sk_buff *skb;
1231                 u32 offset;
1232
1233                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1234                 if (tp->urg_data && tp->urg_seq == *seq) {
1235                         if (copied)
1236                                 break;
1237                         if (signal_pending(current)) {
1238                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1239                                 break;
1240                         }
1241                 }
1242
1243                 /* Next get a buffer. */
1244
1245                 skb = skb_peek(&sk->sk_receive_queue);
1246                 do {
1247                         if (!skb)
1248                                 break;
1249
1250                         /* Now that we have two receive queues this
1251                          * shouldn't happen.
1252                          */
1253                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1254                                 printk(KERN_INFO "recvmsg bug: copied %X "
1255                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1256                                 break;
1257                         }
1258                         offset = *seq - TCP_SKB_CB(skb)->seq;
1259                         if (skb->h.th->syn)
1260                                 offset--;
1261                         if (offset < skb->len)
1262                                 goto found_ok_skb;
1263                         if (skb->h.th->fin)
1264                                 goto found_fin_ok;
1265                         BUG_TRAP(flags & MSG_PEEK);
1266                         skb = skb->next;
1267                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1268
1269                 /* Well, if we have backlog, try to process it now. */
1270
1271                 if (copied >= target && !sk->sk_backlog.tail)
1272                         break;
1273
1274                 if (copied) {
1275                         if (sk->sk_err ||
1276                             sk->sk_state == TCP_CLOSE ||
1277                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1278                             !timeo ||
1279                             signal_pending(current) ||
1280                             (flags & MSG_PEEK))
1281                                 break;
1282                 } else {
1283                         if (sock_flag(sk, SOCK_DONE))
1284                                 break;
1285
1286                         if (sk->sk_err) {
1287                                 copied = sock_error(sk);
1288                                 break;
1289                         }
1290
1291                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1292                                 break;
1293
1294                         if (sk->sk_state == TCP_CLOSE) {
1295                                 if (!sock_flag(sk, SOCK_DONE)) {
1296                                         /* This occurs when the user tries to read
1297                                          * from a never-connected socket.
1298                                          */
1299                                         copied = -ENOTCONN;
1300                                         break;
1301                                 }
1302                                 break;
1303                         }
1304
1305                         if (!timeo) {
1306                                 copied = -EAGAIN;
1307                                 break;
1308                         }
1309
1310                         if (signal_pending(current)) {
1311                                 copied = sock_intr_errno(timeo);
1312                                 break;
1313                         }
1314                 }
1315
1316                 cleanup_rbuf(sk, copied);
1317
1318                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1319                         /* Install new reader */
1320                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1321                                 user_recv = current;
1322                                 tp->ucopy.task = user_recv;
1323                                 tp->ucopy.iov = msg->msg_iov;
1324                         }
1325
1326                         tp->ucopy.len = len;
1327
1328                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1329                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1330
1331                         /* Ugly... If the prequeue is not empty, we have to
1332                          * process it before releasing the socket; otherwise
1333                          * ordering will be broken on the second iteration.
1334                          * A more elegant solution is required!
1335                          *
1336                          * Look: we have the following (pseudo)queues:
1337                          *
1338                          * 1. packets in flight
1339                          * 2. backlog
1340                          * 3. prequeue
1341                          * 4. receive_queue
1342                          *
1343                          * Each queue can be processed only if the next ones
1344                          * are empty. At this point we have an empty receive_queue.
1345                          * But the prequeue _can_ be non-empty after the 2nd iteration,
1346                          * when we jumped to the start of the loop because backlog
1347                          * processing added something to the receive_queue.
1348                          * We cannot release_sock(), because the backlog contains
1349                          * packets that arrived _after_ the prequeued ones.
1350                          *
1351                          * In short, the algorithm is clear: process all
1352                          * the queues in order. We could do it more directly,
1353                          * requeueing packets from the backlog to the prequeue
1354                          * if it is not empty. That is more elegant, but eats
1355                          * cycles, unfortunately.
1356                          */
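                        /* A minimal sketch of that ordering, for illustration
                         * only (not the actual control flow below):
                         *
                         *	if (!skb_queue_empty(&sk->sk_receive_queue))
                         *		copy queued data to the user;	(4)
                         *	else if (!skb_queue_empty(&tp->ucopy.prequeue))
                         *		tcp_prequeue_process(sk);	(3)
                         *	else
                         *		release_sock(sk);		(2) runs the backlog,
                         *					which may refill (3) and (4)
                         */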
1357                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1358                                 goto do_prequeue;
1359
1360                         /* __ Set realtime policy in scheduler __ */
1361                 }
1362
1363                 if (copied >= target) {
1364                         /* Do not sleep, just process backlog. */
1365                         release_sock(sk);
1366                         lock_sock(sk);
1367                 } else
1368                         sk_wait_data(sk, &timeo);
1369
1370                 if (user_recv) {
1371                         int chunk;
1372
1373                         /* __ Restore normal policy in scheduler __ */
1374
1375                         if ((chunk = len - tp->ucopy.len) != 0) {
1376                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1377                                 len -= chunk;
1378                                 copied += chunk;
1379                         }
1380
1381                         if (tp->rcv_nxt == tp->copied_seq &&
1382                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1383 do_prequeue:
1384                                 tcp_prequeue_process(sk);
1385
1386                                 if ((chunk = len - tp->ucopy.len) != 0) {
1387                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1388                                         len -= chunk;
1389                                         copied += chunk;
1390                                 }
1391                         }
1392                 }
1393                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1394                         if (net_ratelimit())
1395                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1396                                        current->comm, current->pid);
1397                         peek_seq = tp->copied_seq;
1398                 }
1399                 continue;
1400
1401         found_ok_skb:
1402                 /* Ok so how much can we use? */
1403                 used = skb->len - offset;
1404                 if (len < used)
1405                         used = len;
1406
1407                 /* Do we have urgent data here? */
1408                 if (tp->urg_data) {
1409                         u32 urg_offset = tp->urg_seq - *seq;
1410                         if (urg_offset < used) {
1411                                 if (!urg_offset) {
1412                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1413                                                 ++*seq;
1414                                                 offset++;
1415                                                 used--;
1416                                                 if (!used)
1417                                                         goto skip_copy;
1418                                         }
1419                                 } else
1420                                         used = urg_offset;
1421                         }
1422                 }
1423
1424                 if (!(flags & MSG_TRUNC)) {
1425                         err = skb_copy_datagram_iovec(skb, offset,
1426                                                       msg->msg_iov, used);
1427                         if (err) {
1428                                 /* Exception. Bailout! */
1429                                 if (!copied)
1430                                         copied = -EFAULT;
1431                                 break;
1432                         }
1433                 }
1434
1435                 *seq += used;
1436                 copied += used;
1437                 len -= used;
1438
1439                 tcp_rcv_space_adjust(sk);
1440
1441 skip_copy:
1442                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1443                         tp->urg_data = 0;
1444                         tcp_fast_path_check(sk, tp);
1445                 }
1446                 if (used + offset < skb->len)
1447                         continue;
1448
1449                 if (skb->h.th->fin)
1450                         goto found_fin_ok;
1451                 if (!(flags & MSG_PEEK))
1452                         sk_eat_skb(sk, skb);
1453                 continue;
1454
1455         found_fin_ok:
1456                 /* Process the FIN. */
1457                 ++*seq;
1458                 if (!(flags & MSG_PEEK))
1459                         sk_eat_skb(sk, skb);
1460                 break;
1461         } while (len > 0);
1462
1463         if (user_recv) {
1464                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1465                         int chunk;
1466
1467                         tp->ucopy.len = copied > 0 ? len : 0;
1468
1469                         tcp_prequeue_process(sk);
1470
1471                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1472                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1473                                 len -= chunk;
1474                                 copied += chunk;
1475                         }
1476                 }
1477
1478                 tp->ucopy.task = NULL;
1479                 tp->ucopy.len = 0;
1480         }
1481
1482         /* According to UNIX98, msg_name/msg_namelen are ignored
1483          * on a connected socket. I was just happy when I found this 8) --ANK
1484          */
1485
1486         /* Clean up data we have read: This will do ACK frames. */
1487         cleanup_rbuf(sk, copied);
1488
1489         TCP_CHECK_TIMER(sk);
1490         release_sock(sk);
1491         return copied;
1492
1493 out:
1494         TCP_CHECK_TIMER(sk);
1495         release_sock(sk);
1496         return err;
1497
1498 recv_urg:
1499         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1500         goto out;
1501 }
1502
1503 /*
1504  *      State processing on a close. This implements the state shift for
1505  *      sending our FIN frame. Note that we only send a FIN for some
1506  *      states. A shutdown() may have already sent the FIN, or we may be
1507  *      closed.
1508  */
1509
1510 static unsigned char new_state[16] = {
1511   /* current state:        new state:      action:      */
1512   /* (Invalid)          */ TCP_CLOSE,
1513   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1514   /* TCP_SYN_SENT       */ TCP_CLOSE,
1515   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1516   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1517   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1518   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1519   /* TCP_CLOSE          */ TCP_CLOSE,
1520   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1521   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1522   /* TCP_LISTEN         */ TCP_CLOSE,
1523   /* TCP_CLOSING        */ TCP_CLOSING,
1524 };
1525
1526 static int tcp_close_state(struct sock *sk)
1527 {
1528         int next = (int)new_state[sk->sk_state];
1529         int ns = next & TCP_STATE_MASK;
1530
1531         tcp_set_state(sk, ns);
1532
1533         return next & TCP_ACTION_FIN;
1534 }
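/*
 * A worked example of the packed table above (illustration only):
 * new_state[TCP_CLOSE_WAIT] is TCP_LAST_ACK | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to TCP_LAST_ACK and returns
 * non-zero, telling the caller to transmit a FIN.  For TCP_FIN_WAIT2
 * the entry is plain TCP_FIN_WAIT2: the state is left alone and no
 * FIN is sent.
 */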
1535
1536 /*
1537  *      Shutdown the sending side of a connection. Much like close except
1538  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1539  */
1540
1541 void tcp_shutdown(struct sock *sk, int how)
1542 {
1543         /*      We need to grab some memory, and put together a FIN,
1544          *      and then put it into the queue to be sent.
1545          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1546          */
1547         if (!(how & SEND_SHUTDOWN))
1548                 return;
1549
1550         /* If we've already sent a FIN, or it's a closed state, skip this. */
1551         if ((1 << sk->sk_state) &
1552             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1553              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1554                 /* Clear out any half completed packets.  FIN if needed. */
1555                 if (tcp_close_state(sk))
1556                         tcp_send_fin(sk);
1557         }
1558 }
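/*
 *	A minimal userspace sketch of the half-close that ends up here,
 *	assuming "fd" is a connected TCP socket (names illustrative only):
 *
 *		shutdown(fd, SHUT_WR);			send our FIN
 *		while (read(fd, buf, sizeof(buf)) > 0)
 *			;				keep draining the peer
 *		close(fd);
 *
 *	SHUT_WR is what arrives here as SEND_SHUTDOWN; the receive side
 *	stays usable until the peer sends its own FIN.
 */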
1559
1560 /*
1561  * At this point, there should be no process reference to this
1562  * socket, and thus no user references at all.  Therefore we
1563  * can assume the socket waitqueue is inactive and nobody will
1564  * try to jump onto it.
1565  */
1566 void tcp_destroy_sock(struct sock *sk)
1567 {
1568         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1569         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1570
1571         /* It cannot be in hash table! */
1572         BUG_TRAP(sk_unhashed(sk));
1573
1574         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1575         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1576
1577         sk->sk_prot->destroy(sk);
1578
1579         sk_stream_kill_queues(sk);
1580
1581         xfrm_sk_free_policy(sk);
1582
1583 #ifdef INET_REFCNT_DEBUG
1584         if (atomic_read(&sk->sk_refcnt) != 1) {
1585                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1586                        sk, atomic_read(&sk->sk_refcnt));
1587         }
1588 #endif
1589
1590         atomic_dec(&tcp_orphan_count);
1591         sock_put(sk);
1592 }
1593
1594 void tcp_close(struct sock *sk, long timeout)
1595 {
1596         struct sk_buff *skb;
1597         int data_was_unread = 0;
1598
1599         lock_sock(sk);
1600         sk->sk_shutdown = SHUTDOWN_MASK;
1601
1602         if (sk->sk_state == TCP_LISTEN) {
1603                 tcp_set_state(sk, TCP_CLOSE);
1604
1605                 /* Special case. */
1606                 tcp_listen_stop(sk);
1607
1608                 goto adjudge_to_death;
1609         }
1610
1611         /*  We need to flush the recv. buffs.  We do this only on the
1612          *  descriptor close, not protocol-sourced closes, because the
1613          *  reader process may not have drained the data yet!
1614          */
1615         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1616                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1617                           skb->h.th->fin;
1618                 data_was_unread += len;
1619                 __kfree_skb(skb);
1620         }
1621
1622         sk_stream_mem_reclaim(sk);
1623
1624         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1625          * 3.10, we send a RST here because data was lost.  To
1626          * witness the awful effects of the old behavior of always
1627          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1628          * a bulk GET in an FTP client, suspend the process, wait
1629          * for the client to advertise a zero window, then kill -9
1630          * the FTP client, wheee...  Note: timeout is always zero
1631          * in such a case.
1632          */
1633         if (data_was_unread) {
1634                 /* Unread data was tossed, zap the connection. */
1635                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1636                 tcp_set_state(sk, TCP_CLOSE);
1637                 tcp_send_active_reset(sk, GFP_KERNEL);
1638         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1639                 /* Check zero linger _after_ checking for unread data. */
1640                 sk->sk_prot->disconnect(sk, 0);
1641                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1642         } else if (tcp_close_state(sk)) {
1643                 /* We FIN if the application ate all the data before
1644                  * zapping the connection.
1645                  */
1646
1647                 /* RED-PEN. Formally speaking, we have broken TCP state
1648                  * machine. State transitions:
1649                  *
1650                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1651                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1652                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1653                  *
1654                  * are legal only when FIN has been sent (i.e. in window),
1655                  * rather than queued out of window. Purists will complain.
1656                  *
1657                  * F.e. "RFC state" is ESTABLISHED,
1658                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1659                  *
1660                  * The visible deviations are that we sometimes
1661                  * enter the time-wait state when it is not really required
1662                  * (harmless), and do not send active resets when they are
1663                  * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1664                  * they look like CLOSING or LAST_ACK to Linux).
1665                  * I probably missed some more small holes.
1666                  *                                              --ANK
1667                  */
1668                 tcp_send_fin(sk);
1669         }
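        /* Userspace view of the three branches above (a rough sketch,
         * assuming "fd" is an established TCP socket): closing with
         * unread data in the receive queue makes the peer see a RST,
         * as does a zero-timeout SO_LINGER:
         *
         *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
         *
         *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
         *	close(fd);			abort: RST instead of FIN
         *
         * Only when all received data has been consumed does close()
         * take the third branch and start the normal FIN handshake.
         */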
1670
1671         sk_stream_wait_close(sk, timeout);
1672
1673 adjudge_to_death:
1674         /* This is the last release_sock in its life. It will remove the backlog. */
1675         release_sock(sk);
1676
1677
1678         /* Now the socket is owned by the kernel and we acquire the BH lock
1679          * to finish the close. No need to check for user refs.
1680          */
1681         local_bh_disable();
1682         bh_lock_sock(sk);
1683         BUG_TRAP(!sock_owned_by_user(sk));
1684
1685         sock_hold(sk);
1686         sock_orphan(sk);
1687
1688         /*      This is a (useful) BSD violation of the RFC. There is a
1689          *      problem with TCP as specified, in that the other end could
1690          *      keep a socket open forever with no application left at this end.
1691          *      We use a 3 minute timeout (about the same as BSD) and then kill
1692          *      our end. If they send after that then tough - BUT: it is long
1693          *      enough that we won't repeat the old "4*rto = almost no time -
1694          *      whoops reset" mistake.
1695          *
1696          *      Nope, it was not a mistake. It is really the desired behaviour,
1697          *      e.g. on HTTP servers, where such sockets are useless but
1698          *      consume significant resources. Let's do it with the special
1699          *      linger2 option.                                 --ANK
1700          */
1701
1702         if (sk->sk_state == TCP_FIN_WAIT2) {
1703                 struct tcp_sock *tp = tcp_sk(sk);
1704                 if (tp->linger2 < 0) {
1705                         tcp_set_state(sk, TCP_CLOSE);
1706                         tcp_send_active_reset(sk, GFP_ATOMIC);
1707                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1708                 } else {
1709                         int tmo = tcp_fin_time(tp);
1710
1711                         if (tmo > TCP_TIMEWAIT_LEN) {
1712                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1713                         } else {
1714                                 atomic_inc(&tcp_orphan_count);
1715                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1716                                 goto out;
1717                         }
1718                 }
1719         }
1720         if (sk->sk_state != TCP_CLOSE) {
1721                 sk_stream_mem_reclaim(sk);
1722                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1723                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1724                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1725                         if (net_ratelimit())
1726                                 printk(KERN_INFO "TCP: too many orphaned "
1727                                        "sockets\n");
1728                         tcp_set_state(sk, TCP_CLOSE);
1729                         tcp_send_active_reset(sk, GFP_ATOMIC);
1730                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1731                 }
1732         }
1733         atomic_inc(&tcp_orphan_count);
1734
1735         if (sk->sk_state == TCP_CLOSE)
1736                 tcp_destroy_sock(sk);
1737         /* Otherwise, socket is reprieved until protocol close. */
1738
1739 out:
1740         bh_unlock_sock(sk);
1741         local_bh_enable();
1742         sock_put(sk);
1743 }
1744
1745 /* These states need RST on ABORT according to RFC793 */
1746
1747 static inline int tcp_need_reset(int state)
1748 {
1749         return (1 << state) &
1750                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1751                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1752 }
1753
1754 int tcp_disconnect(struct sock *sk, int flags)
1755 {
1756         struct inet_sock *inet = inet_sk(sk);
1757         struct tcp_sock *tp = tcp_sk(sk);
1758         int err = 0;
1759         int old_state = sk->sk_state;
1760
1761         if (old_state != TCP_CLOSE)
1762                 tcp_set_state(sk, TCP_CLOSE);
1763
1764         /* ABORT function of RFC793 */
1765         if (old_state == TCP_LISTEN) {
1766                 tcp_listen_stop(sk);
1767         } else if (tcp_need_reset(old_state) ||
1768                    (tp->snd_nxt != tp->write_seq &&
1769                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1770                 /* The last check adjusts for the discrepancy between Linux
1771                  * and RFC states.
1772                  */
1773                 tcp_send_active_reset(sk, gfp_any());
1774                 sk->sk_err = ECONNRESET;
1775         } else if (old_state == TCP_SYN_SENT)
1776                 sk->sk_err = ECONNRESET;
1777
1778         tcp_clear_xmit_timers(sk);
1779         __skb_queue_purge(&sk->sk_receive_queue);
1780         sk_stream_writequeue_purge(sk);
1781         __skb_queue_purge(&tp->out_of_order_queue);
1782
1783         inet->dport = 0;
1784
1785         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1786                 inet_reset_saddr(sk);
1787
1788         sk->sk_shutdown = 0;
1789         sock_reset_flag(sk, SOCK_DONE);
1790         tp->srtt = 0;
1791         if ((tp->write_seq += tp->max_window + 2) == 0)
1792                 tp->write_seq = 1;
1793         tp->backoff = 0;
1794         tp->snd_cwnd = 2;
1795         tp->probes_out = 0;
1796         tp->packets_out = 0;
1797         tp->snd_ssthresh = 0x7fffffff;
1798         tp->snd_cwnd_cnt = 0;
1799         tcp_set_ca_state(tp, TCP_CA_Open);
1800         tcp_clear_retrans(tp);
1801         tcp_delack_init(tp);
1802         sk->sk_send_head = NULL;
1803         tp->rx_opt.saw_tstamp = 0;
1804         tcp_sack_reset(&tp->rx_opt);
1805         __sk_dst_reset(sk);
1806
1807         BUG_TRAP(!inet->num || tp->bind_hash);
1808
1809         sk->sk_error_report(sk);
1810         return err;
1811 }
1812
1813 /*
1814  *      Wait for an incoming connection, avoid race
1815  *      conditions. This must be called with the socket locked.
1816  */
1817 static int wait_for_connect(struct sock *sk, long timeo)
1818 {
1819         struct tcp_sock *tp = tcp_sk(sk);
1820         DEFINE_WAIT(wait);
1821         int err;
1822
1823         /*
1824          * True wake-one mechanism for incoming connections: only
1825          * one process gets woken up, not the 'whole herd'.
1826          * Since we do not 'race & poll' for established sockets
1827          * anymore, the common case will execute the loop only once.
1828          *
1829          * Subtle issue: "add_wait_queue_exclusive()" will be added
1830          * after any current non-exclusive waiters, and we know that
1831          * it will always _stay_ after any new non-exclusive waiters
1832          * because all non-exclusive waiters are added at the
1833          * beginning of the wait-queue. As such, it's ok to "drop"
1834          * our exclusiveness temporarily when we get woken up without
1835          * having to remove and re-insert us on the wait queue.
1836          */
1837         for (;;) {
1838                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1839                                           TASK_INTERRUPTIBLE);
1840                 release_sock(sk);
1841                 if (reqsk_queue_empty(&tp->accept_queue))
1842                         timeo = schedule_timeout(timeo);
1843                 lock_sock(sk);
1844                 err = 0;
1845                 if (!reqsk_queue_empty(&tp->accept_queue))
1846                         break;
1847                 err = -EINVAL;
1848                 if (sk->sk_state != TCP_LISTEN)
1849                         break;
1850                 err = sock_intr_errno(timeo);
1851                 if (signal_pending(current))
1852                         break;
1853                 err = -EAGAIN;
1854                 if (!timeo)
1855                         break;
1856         }
1857         finish_wait(sk->sk_sleep, &wait);
1858         return err;
1859 }
1860
1861 /*
1862  *      This will accept the next outstanding connection.
1863  */
1864
1865 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1866 {
1867         struct tcp_sock *tp = tcp_sk(sk);
1868         struct sock *newsk;
1869         int error;
1870
1871         lock_sock(sk);
1872
1873         /* We need to make sure that this socket is listening,
1874          * and that it has something pending.
1875          */
1876         error = -EINVAL;
1877         if (sk->sk_state != TCP_LISTEN)
1878                 goto out_err;
1879
1880         /* Find already established connection */
1881         if (reqsk_queue_empty(&tp->accept_queue)) {
1882                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1883
1884                 /* If this is a non-blocking socket, don't sleep */
1885                 error = -EAGAIN;
1886                 if (!timeo)
1887                         goto out_err;
1888
1889                 error = wait_for_connect(sk, timeo);
1890                 if (error)
1891                         goto out_err;
1892         }
1893
1894         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1895         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1896 out:
1897         release_sock(sk);
1898         return newsk;
1899 out_err:
1900         newsk = NULL;
1901         *err = error;
1902         goto out;
1903 }
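/*
 *	A small userspace sketch of the non-blocking path above, assuming
 *	"lfd" is a listening socket with O_NONBLOCK set (illustrative only):
 *
 *		int cfd = accept(lfd, NULL, NULL);
 *
 *		if (cfd < 0 && errno == EAGAIN)
 *			wait in poll() for lfd to become readable, then retry;
 *
 *	The -EAGAIN above is what such a caller sees when the accept
 *	queue is empty and the receive timeout is zero.
 */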
1904
1905 /*
1906  *      Socket option code for TCP.
1907  */
1908 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1909                    int optlen)
1910 {
1911         struct tcp_sock *tp = tcp_sk(sk);
1912         int val;
1913         int err = 0;
1914
1915         if (level != SOL_TCP)
1916                 return tp->af_specific->setsockopt(sk, level, optname,
1917                                                    optval, optlen);
1918
1919         /* This is a string value; all the others are ints */
1920         if (optname == TCP_CONGESTION) {
1921                 char name[TCP_CA_NAME_MAX];
1922
1923                 if (optlen < 1)
1924                         return -EINVAL;
1925
1926                 val = strncpy_from_user(name, optval,
1927                                         min(TCP_CA_NAME_MAX-1, optlen));
1928                 if (val < 0)
1929                         return -EFAULT;
1930                 name[val] = 0;
1931
1932                 lock_sock(sk);
1933                 err = tcp_set_congestion_control(tp, name);
1934                 release_sock(sk);
1935                 return err;
1936         }
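        /* Userspace sketch of this string-valued option, assuming the
         * "reno" algorithm registered in tcp_init() is available:
         *
         *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", 4);
         *
         * Unlike the integer options handled below, the value is a name
         * that tcp_set_congestion_control() looks up.
         */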
1937
1938         if (optlen < sizeof(int))
1939                 return -EINVAL;
1940
1941         if (get_user(val, (int __user *)optval))
1942                 return -EFAULT;
1943
1944         lock_sock(sk);
1945
1946         switch (optname) {
1947         case TCP_MAXSEG:
1948                 /* Values greater than the interface MTU won't take effect.
1949                  * However, at the point when this call is made we typically
1950                  * don't yet know which interface is going to be used. */
1951                 if (val < 8 || val > MAX_TCP_WINDOW) {
1952                         err = -EINVAL;
1953                         break;
1954                 }
1955                 tp->rx_opt.user_mss = val;
1956                 break;
1957
1958         case TCP_NODELAY:
1959                 if (val) {
1960                         /* TCP_NODELAY is weaker than TCP_CORK, so this
1961                          * option on a corked socket is remembered, but
1962                          * it is not activated until the cork is cleared.
1963                          *
1964                          * However, when TCP_NODELAY is set we make
1965                          * an explicit push, which overrides even TCP_CORK
1966                          * for currently queued segments.
1967                          */
1968                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1969                         tcp_push_pending_frames(sk, tp);
1970                 } else {
1971                         tp->nonagle &= ~TCP_NAGLE_OFF;
1972                 }
1973                 break;
1974
1975         case TCP_CORK:
1976                 /* When set, always queue non-full frames.
1977                  * Later the user clears this option and we transmit
1978                  * any pending partial frames in the queue.  This is
1979                  * meant to be used alongside sendfile() to get properly
1980                  * filled frames when the user (for example) must write
1981                  * out headers with a write() call first and then use
1982                  * sendfile to send out the data parts.
1983                  *
1984                  * TCP_CORK can be set together with TCP_NODELAY and it is
1985                  * stronger than TCP_NODELAY.
1986                  */
1987                 if (val) {
1988                         tp->nonagle |= TCP_NAGLE_CORK;
1989                 } else {
1990                         tp->nonagle &= ~TCP_NAGLE_CORK;
1991                         if (tp->nonagle&TCP_NAGLE_OFF)
1992                                 tp->nonagle |= TCP_NAGLE_PUSH;
1993                         tcp_push_pending_frames(sk, tp);
1994                 }
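                /* The pattern described above, as a userspace sketch
                 * ("fd" is a connected socket, "file_fd" an open file;
                 * the names and lengths are illustrative only):
                 *
                 *	int on = 1, off = 0;
                 *
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
                 *	write(fd, header, header_len);		 queued, not sent
                 *	sendfile(fd, file_fd, NULL, file_len);	 still corked
                 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
                 *
                 * Clearing the cork takes the else branch above and pushes
                 * out the final partial frame.
                 */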
1995                 break;
1996
1997         case TCP_KEEPIDLE:
1998                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1999                         err = -EINVAL;
2000                 else {
2001                         tp->keepalive_time = val * HZ;
2002                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2003                             !((1 << sk->sk_state) &
2004                               (TCPF_CLOSE | TCPF_LISTEN))) {
2005                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2006                                 if (tp->keepalive_time > elapsed)
2007                                         elapsed = tp->keepalive_time - elapsed;
2008                                 else
2009                                         elapsed = 0;
2010                                 tcp_reset_keepalive_timer(sk, elapsed);
2011                         }
2012                 }
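                /* Example of the rearm above: with keepalive_time set to
                 * 600 seconds and the last segment received 200 seconds
                 * ago, the timer is rescheduled for the remaining 400
                 * seconds; if 600 or more have already elapsed it is set
                 * to fire immediately (elapsed == 0).
                 */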
2013                 break;
2014         case TCP_KEEPINTVL:
2015                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2016                         err = -EINVAL;
2017                 else
2018                         tp->keepalive_intvl = val * HZ;
2019                 break;
2020         case TCP_KEEPCNT:
2021                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2022                         err = -EINVAL;
2023                 else
2024                         tp->keepalive_probes = val;
2025                 break;
2026         case TCP_SYNCNT:
2027                 if (val < 1 || val > MAX_TCP_SYNCNT)
2028                         err = -EINVAL;
2029                 else
2030                         tp->syn_retries = val;
2031                 break;
2032
2033         case TCP_LINGER2:
2034                 if (val < 0)
2035                         tp->linger2 = -1;
2036                 else if (val > sysctl_tcp_fin_timeout / HZ)
2037                         tp->linger2 = 0;
2038                 else
2039                         tp->linger2 = val * HZ;
2040                 break;
2041
2042         case TCP_DEFER_ACCEPT:
2043                 tp->defer_accept = 0;
2044                 if (val > 0) {
2045                         /* Translate value in seconds to number of
2046                          * retransmits */
2047                         while (tp->defer_accept < 32 &&
2048                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2049                                        tp->defer_accept))
2050                                 tp->defer_accept++;
2051                         tp->defer_accept++;
2052                 }
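                /* Worked example of the translation above, assuming
                 * TCP_TIMEOUT_INIT is 3*HZ as in this tree: val = 10
                 * seconds is compared against 3, 6 and 12, the loop stops
                 * with defer_accept == 2, and the final increment leaves
                 * defer_accept == 3.  getsockopt() later reports this as
                 * 3 << (3 - 1) == 12 seconds.
                 */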
2053                 break;
2054
2055         case TCP_WINDOW_CLAMP:
2056                 if (!val) {
2057                         if (sk->sk_state != TCP_CLOSE) {
2058                                 err = -EINVAL;
2059                                 break;
2060                         }
2061                         tp->window_clamp = 0;
2062                 } else
2063                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2064                                                 SOCK_MIN_RCVBUF / 2 : val;
2065                 break;
2066
2067         case TCP_QUICKACK:
2068                 if (!val) {
2069                         tp->ack.pingpong = 1;
2070                 } else {
2071                         tp->ack.pingpong = 0;
2072                         if ((1 << sk->sk_state) &
2073                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2074                             tcp_ack_scheduled(tp)) {
2075                                 tp->ack.pending |= TCP_ACK_PUSHED;
2076                                 cleanup_rbuf(sk, 1);
2077                                 if (!(val & 1))
2078                                         tp->ack.pingpong = 1;
2079                         }
2080                 }
2081                 break;
2082
2083         default:
2084                 err = -ENOPROTOOPT;
2085                 break;
2086         };
2087         release_sock(sk);
2088         return err;
2089 }
2090
2091 /* Return information about the state of the TCP endpoint in API format. */
2092 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2093 {
2094         struct tcp_sock *tp = tcp_sk(sk);
2095         u32 now = tcp_time_stamp;
2096
2097         memset(info, 0, sizeof(*info));
2098
2099         info->tcpi_state = sk->sk_state;
2100         info->tcpi_ca_state = tp->ca_state;
2101         info->tcpi_retransmits = tp->retransmits;
2102         info->tcpi_probes = tp->probes_out;
2103         info->tcpi_backoff = tp->backoff;
2104
2105         if (tp->rx_opt.tstamp_ok)
2106                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2107         if (tp->rx_opt.sack_ok)
2108                 info->tcpi_options |= TCPI_OPT_SACK;
2109         if (tp->rx_opt.wscale_ok) {
2110                 info->tcpi_options |= TCPI_OPT_WSCALE;
2111                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2112                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2113         } 
2114
2115         if (tp->ecn_flags&TCP_ECN_OK)
2116                 info->tcpi_options |= TCPI_OPT_ECN;
2117
2118         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2119         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2120         info->tcpi_snd_mss = tp->mss_cache;
2121         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2122
2123         info->tcpi_unacked = tp->packets_out;
2124         info->tcpi_sacked = tp->sacked_out;
2125         info->tcpi_lost = tp->lost_out;
2126         info->tcpi_retrans = tp->retrans_out;
2127         info->tcpi_fackets = tp->fackets_out;
2128
2129         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2130         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2131         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2132
2133         info->tcpi_pmtu = tp->pmtu_cookie;
2134         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2135         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2136         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2137         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2138         info->tcpi_snd_cwnd = tp->snd_cwnd;
2139         info->tcpi_advmss = tp->advmss;
2140         info->tcpi_reordering = tp->reordering;
2141
2142         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2143         info->tcpi_rcv_space = tp->rcvq_space.space;
2144
2145         info->tcpi_total_retrans = tp->total_retrans;
2146 }
2147
2148 EXPORT_SYMBOL_GPL(tcp_get_info);
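/*
 * A minimal userspace sketch of consuming this structure, assuming
 * struct tcp_info is exported to applications via <netinet/tcp.h>
 * (field names as filled in above):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt %u us, cwnd %u, total retrans %u\n",
 *		       info.tcpi_rtt, info.tcpi_snd_cwnd,
 *		       info.tcpi_total_retrans);
 */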
2149
2150 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2151                    int __user *optlen)
2152 {
2153         struct tcp_sock *tp = tcp_sk(sk);
2154         int val, len;
2155
2156         if (level != SOL_TCP)
2157                 return tp->af_specific->getsockopt(sk, level, optname,
2158                                                    optval, optlen);
2159
2160         if (get_user(len, optlen))
2161                 return -EFAULT;
2162
2163         len = min_t(unsigned int, len, sizeof(int));
2164
2165         if (len < 0)
2166                 return -EINVAL;
2167
2168         switch (optname) {
2169         case TCP_MAXSEG:
2170                 val = tp->mss_cache;
2171                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2172                         val = tp->rx_opt.user_mss;
2173                 break;
2174         case TCP_NODELAY:
2175                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2176                 break;
2177         case TCP_CORK:
2178                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2179                 break;
2180         case TCP_KEEPIDLE:
2181                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2182                 break;
2183         case TCP_KEEPINTVL:
2184                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2185                 break;
2186         case TCP_KEEPCNT:
2187                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2188                 break;
2189         case TCP_SYNCNT:
2190                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2191                 break;
2192         case TCP_LINGER2:
2193                 val = tp->linger2;
2194                 if (val >= 0)
2195                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2196                 break;
2197         case TCP_DEFER_ACCEPT:
2198                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2199                                                (tp->defer_accept - 1));
2200                 break;
2201         case TCP_WINDOW_CLAMP:
2202                 val = tp->window_clamp;
2203                 break;
2204         case TCP_INFO: {
2205                 struct tcp_info info;
2206
2207                 if (get_user(len, optlen))
2208                         return -EFAULT;
2209
2210                 tcp_get_info(sk, &info);
2211
2212                 len = min_t(unsigned int, len, sizeof(info));
2213                 if (put_user(len, optlen))
2214                         return -EFAULT;
2215                 if (copy_to_user(optval, &info, len))
2216                         return -EFAULT;
2217                 return 0;
2218         }
2219         case TCP_QUICKACK:
2220                 val = !tp->ack.pingpong;
2221                 break;
2222
2223         case TCP_CONGESTION:
2224                 if (get_user(len, optlen))
2225                         return -EFAULT;
2226                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2227                 if (put_user(len, optlen))
2228                         return -EFAULT;
2229                 if (copy_to_user(optval, tp->ca_ops->name, len))
2230                         return -EFAULT;
2231                 return 0;
2232         default:
2233                 return -ENOPROTOOPT;
2234         };
2235
2236         if (put_user(len, optlen))
2237                 return -EFAULT;
2238         if (copy_to_user(optval, &val, len))
2239                 return -EFAULT;
2240         return 0;
2241 }
2242
2243
2244 extern void __skb_cb_too_small_for_tcp(int, int);
2245 extern struct tcp_congestion_ops tcp_reno;
2246
2247 static __initdata unsigned long thash_entries;
2248 static int __init set_thash_entries(char *str)
2249 {
2250         if (!str)
2251                 return 0;
2252         thash_entries = simple_strtoul(str, &str, 0);
2253         return 1;
2254 }
2255 __setup("thash_entries=", set_thash_entries);
2256
2257 void __init tcp_init(void)
2258 {
2259         struct sk_buff *skb = NULL;
2260         int order, i;
2261
2262         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2263                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2264                                            sizeof(skb->cb));
2265
2266         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2267                                               sizeof(struct tcp_bind_bucket),
2268                                               0, SLAB_HWCACHE_ALIGN,
2269                                               NULL, NULL);
2270         if (!tcp_bucket_cachep)
2271                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2272
2273         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2274                                                 sizeof(struct tcp_tw_bucket),
2275                                                 0, SLAB_HWCACHE_ALIGN,
2276                                                 NULL, NULL);
2277         if (!tcp_timewait_cachep)
2278                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2279
2280         /* Size and allocate the main established and bind bucket
2281          * hash tables.
2282          *
2283          * The methodology is similar to that of the buffer cache.
2284          */
2285         tcp_ehash = (struct tcp_ehash_bucket *)
2286                 alloc_large_system_hash("TCP established",
2287                                         sizeof(struct tcp_ehash_bucket),
2288                                         thash_entries,
2289                                         (num_physpages >= 128 * 1024) ?
2290                                                 (25 - PAGE_SHIFT) :
2291                                                 (27 - PAGE_SHIFT),
2292                                         HASH_HIGHMEM,
2293                                         &tcp_ehash_size,
2294                                         NULL,
2295                                         0);
2296         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2297         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2298                 rwlock_init(&tcp_ehash[i].lock);
2299                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2300         }
2301
2302         tcp_bhash = (struct tcp_bind_hashbucket *)
2303                 alloc_large_system_hash("TCP bind",
2304                                         sizeof(struct tcp_bind_hashbucket),
2305                                         tcp_ehash_size,
2306                                         (num_physpages >= 128 * 1024) ?
2307                                                 (25 - PAGE_SHIFT) :
2308                                                 (27 - PAGE_SHIFT),
2309                                         HASH_HIGHMEM,
2310                                         &tcp_bhash_size,
2311                                         NULL,
2312                                         64 * 1024);
2313         tcp_bhash_size = 1 << tcp_bhash_size;
2314         for (i = 0; i < tcp_bhash_size; i++) {
2315                 spin_lock_init(&tcp_bhash[i].lock);
2316                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2317         }
2318
2319         /* Try to be a bit smarter and adjust defaults depending
2320          * on available memory.
2321          */
2322         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2323                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2324                         order++)
2325                 ;
2326         if (order >= 4) {
2327                 sysctl_local_port_range[0] = 32768;
2328                 sysctl_local_port_range[1] = 61000;
2329                 sysctl_tcp_max_tw_buckets = 180000;
2330                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2331                 sysctl_max_syn_backlog = 1024;
2332         } else if (order < 3) {
2333                 sysctl_local_port_range[0] = 1024 * (3 - order);
2334                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2335                 sysctl_tcp_max_orphans >>= (3 - order);
2336                 sysctl_max_syn_backlog = 128;
2337         }
2338         tcp_port_rover = sysctl_local_port_range[0] - 1;
2339
2340         sysctl_tcp_mem[0] =  768 << order;
2341         sysctl_tcp_mem[1] = 1024 << order;
2342         sysctl_tcp_mem[2] = 1536 << order;
2343
2344         if (order < 3) {
2345                 sysctl_tcp_wmem[2] = 64 * 1024;
2346                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2347                 sysctl_tcp_rmem[1] = 43689;
2348                 sysctl_tcp_rmem[2] = 2 * 43689;
2349         }
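        /* A worked illustration of the sizing above, assuming 4 KB pages:
         * with a 64K-entry bind hash the table is several hundred
         * kilobytes, the loop ends with order >= 4, and the large-memory
         * defaults kick in (local ports 32768-61000, 180000 time-wait
         * buckets, a 1024-entry SYN backlog).  On small machines order
         * stays below 3 and the same knobs are scaled down instead.
         */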
2350
2351         printk(KERN_INFO "TCP: Hash tables configured "
2352                "(established %d bind %d)\n",
2353                tcp_ehash_size << 1, tcp_bhash_size);
2354
2355         tcp_register_congestion_control(&tcp_reno);
2356 }
2357
2358 EXPORT_SYMBOL(tcp_accept);
2359 EXPORT_SYMBOL(tcp_close);
2360 EXPORT_SYMBOL(tcp_destroy_sock);
2361 EXPORT_SYMBOL(tcp_disconnect);
2362 EXPORT_SYMBOL(tcp_getsockopt);
2363 EXPORT_SYMBOL(tcp_ioctl);
2364 EXPORT_SYMBOL(tcp_poll);
2365 EXPORT_SYMBOL(tcp_read_sock);
2366 EXPORT_SYMBOL(tcp_recvmsg);
2367 EXPORT_SYMBOL(tcp_sendmsg);
2368 EXPORT_SYMBOL(tcp_sendpage);
2369 EXPORT_SYMBOL(tcp_setsockopt);
2370 EXPORT_SYMBOL(tcp_shutdown);
2371 EXPORT_SYMBOL(tcp_statistics);
2372 EXPORT_SYMBOL(tcp_timewait_cachep);