Merge master.kernel.org:/home/rmk/linux-2.6-serial
[pandora-kernel.git] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:     Ross Biro
13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *              Alan Cox        :       Numerous verify_area() problems
19  *              Alan Cox        :       Connecting on a connecting socket
20  *                                      now returns an error for tcp.
21  *              Alan Cox        :       sock->protocol is set correctly.
22  *                                      and is not sometimes left as 0.
23  *              Alan Cox        :       connect handles icmp errors on a
24  *                                      connect properly. Unfortunately there
25  *                                      is a restart syscall nasty there. I
26  *                                      can't match BSD without hacking the C
27  *                                      library. Ideas urgently sought!
28  *              Alan Cox        :       Disallow bind() to addresses that are
29  *                                      not ours - especially broadcast ones!!
30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
32  *                                      instead they leave that for the DESTROY timer.
33  *              Alan Cox        :       Clean up error flag in accept
34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
35  *                                      was buggy. Put a remove_sock() in the handler
36  *                                      for memory when we hit 0. Also altered the timer
37  *                                      code. The ACK stuff can wait and needs major 
38  *                                      TCP layer surgery.
39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
40  *                                      and fixed timer/inet_bh race.
41  *              Alan Cox        :       Added zapped flag for TCP
42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
49  *      Pauline Middelink       :       identd support
50  *              Alan Cox        :       Fixed connect() taking signals I think.
51  *              Alan Cox        :       SO_LINGER supported
52  *              Alan Cox        :       Error reporting fixes
53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
54  *              Alan Cox        :       inet sockets don't set sk->type!
55  *              Alan Cox        :       Split socket option code
56  *              Alan Cox        :       Callbacks
57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
58  *              Alex            :       Removed restriction on inet fioctl
59  *              Alan Cox        :       Splitting INET from NET core
60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
62  *              Alan Cox        :       Split IP from generic code
63  *              Alan Cox        :       New kfree_skbmem()
64  *              Alan Cox        :       Make SO_DEBUG superuser only.
65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
66  *                                      (compatibility fix)
67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
68  *              Alan Cox        :       Allocator for a socket is settable.
69  *              Alan Cox        :       SO_ERROR includes soft errors.
70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
71  *              Alan Cox        :       Generic socket allocation to make hooks
72  *                                      easier (suggested by Craig Metz).
73  *              Michael Pall    :       SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
81  *              Andi Kleen      :       Fix write_space callback
82  *              Chris Evans     :       Security fixes - signedness again
83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *              This program is free software; you can redistribute it and/or
89  *              modify it under the terms of the GNU General Public License
90  *              as published by the Free Software Foundation; either version
91  *              2 of the License, or (at your option) any later version.
92  */
93
94 #include <linux/capability.h>
95 #include <linux/config.h>
96 #include <linux/errno.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
118
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/request_sock.h>
123 #include <net/sock.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * The limits below are computed as a per-packet charge (one struct sk_buff
 * plus 256 bytes) multiplied by a fixed packet count.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters.
 *
 * sysctl_wmem_max and sysctl_rmem_max cap the values accepted by the
 * SO_SNDBUF and SO_RCVBUF socket options in sock_setsockopt().
 * The *_default variables start out equal to the maxima.
 */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
151
152 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
153 {
154         struct timeval tv;
155
156         if (optlen < sizeof(tv))
157                 return -EINVAL;
158         if (copy_from_user(&tv, optval, sizeof(tv)))
159                 return -EFAULT;
160
161         *timeo_p = MAX_SCHEDULE_TIMEOUT;
162         if (tv.tv_sec == 0 && tv.tv_usec == 0)
163                 return 0;
164         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
165                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
166         return 0;
167 }
168
169 static void sock_warn_obsolete_bsdism(const char *name)
170 {
171         static int warned;
172         static char warncomm[TASK_COMM_LEN];
173         if (strcmp(warncomm, current->comm) && warned < 5) { 
174                 strcpy(warncomm,  current->comm); 
175                 printk(KERN_WARNING "process `%s' is using obsolete "
176                        "%s SO_BSDCOMPAT\n", warncomm, name);
177                 warned++;
178         }
179 }
180
181 static void sock_disable_timestamp(struct sock *sk)
182 {       
183         if (sock_flag(sk, SOCK_TIMESTAMP)) { 
184                 sock_reset_flag(sk, SOCK_TIMESTAMP);
185                 net_disable_timestamp();
186         }
187 }
188
189
190 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
191 {
192         int err = 0;
193         int skb_len;
194
195         /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
196            number of warnings when compiling with -W --ANK
197          */
198         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
199             (unsigned)sk->sk_rcvbuf) {
200                 err = -ENOMEM;
201                 goto out;
202         }
203
204         /* It would be deadlock, if sock_queue_rcv_skb is used
205            with socket lock! We assume that users of this
206            function are lock free.
207         */
208         err = sk_filter(sk, skb, 1);
209         if (err)
210                 goto out;
211
212         skb->dev = NULL;
213         skb_set_owner_r(skb, sk);
214
215         /* Cache the SKB length before we tack it onto the receive
216          * queue.  Once it is added it no longer belongs to us and
217          * may be freed by other threads of control pulling packets
218          * from the queue.
219          */
220         skb_len = skb->len;
221
222         skb_queue_tail(&sk->sk_receive_queue, skb);
223
224         if (!sock_flag(sk, SOCK_DEAD))
225                 sk->sk_data_ready(sk, skb_len);
226 out:
227         return err;
228 }
229 EXPORT_SYMBOL(sock_queue_rcv_skb);
230
231 int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
232 {
233         int rc = NET_RX_SUCCESS;
234
235         if (sk_filter(sk, skb, 0))
236                 goto discard_and_relse;
237
238         skb->dev = NULL;
239
240         bh_lock_sock(sk);
241         if (!sock_owned_by_user(sk))
242                 rc = sk->sk_backlog_rcv(sk, skb);
243         else
244                 sk_add_backlog(sk, skb);
245         bh_unlock_sock(sk);
246 out:
247         sock_put(sk);
248         return rc;
249 discard_and_relse:
250         kfree_skb(skb);
251         goto out;
252 }
253 EXPORT_SYMBOL(sk_receive_skb);
254
255 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
256 {
257         struct dst_entry *dst = sk->sk_dst_cache;
258
259         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
260                 sk->sk_dst_cache = NULL;
261                 dst_release(dst);
262                 return NULL;
263         }
264
265         return dst;
266 }
267 EXPORT_SYMBOL(__sk_dst_check);
268
269 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
270 {
271         struct dst_entry *dst = sk_dst_get(sk);
272
273         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
274                 sk_dst_reset(sk);
275                 dst_release(dst);
276                 return NULL;
277         }
278
279         return dst;
280 }
281 EXPORT_SYMBOL(sk_dst_check);
282
283 /*
284  *      This is meant for all protocols to use and covers goings on
285  *      at the socket level. Everything here is generic.
286  */
287
288 int sock_setsockopt(struct socket *sock, int level, int optname,
289                     char __user *optval, int optlen)
290 {
291         struct sock *sk=sock->sk;
292         struct sk_filter *filter;
293         int val;
294         int valbool;
295         struct linger ling;
296         int ret = 0;
297         
298         /*
299          *      Options without arguments
300          */
301
302 #ifdef SO_DONTLINGER            /* Compatibility item... */
303         if (optname == SO_DONTLINGER) {
304                 lock_sock(sk);
305                 sock_reset_flag(sk, SOCK_LINGER);
306                 release_sock(sk);
307                 return 0;
308         }
309 #endif
310         
311         if(optlen<sizeof(int))
312                 return(-EINVAL);
313         
314         if (get_user(val, (int __user *)optval))
315                 return -EFAULT;
316         
317         valbool = val?1:0;
318
319         lock_sock(sk);
320
321         switch(optname) 
322         {
323                 case SO_DEBUG:  
324                         if(val && !capable(CAP_NET_ADMIN))
325                         {
326                                 ret = -EACCES;
327                         }
328                         else if (valbool)
329                                 sock_set_flag(sk, SOCK_DBG);
330                         else
331                                 sock_reset_flag(sk, SOCK_DBG);
332                         break;
333                 case SO_REUSEADDR:
334                         sk->sk_reuse = valbool;
335                         break;
336                 case SO_TYPE:
337                 case SO_ERROR:
338                         ret = -ENOPROTOOPT;
339                         break;
340                 case SO_DONTROUTE:
341                         if (valbool)
342                                 sock_set_flag(sk, SOCK_LOCALROUTE);
343                         else
344                                 sock_reset_flag(sk, SOCK_LOCALROUTE);
345                         break;
346                 case SO_BROADCAST:
347                         sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
348                         break;
349                 case SO_SNDBUF:
350                         /* Don't error on this BSD doesn't and if you think
351                            about it this is right. Otherwise apps have to
352                            play 'guess the biggest size' games. RCVBUF/SNDBUF
353                            are treated in BSD as hints */
354                            
355                         if (val > sysctl_wmem_max)
356                                 val = sysctl_wmem_max;
357 set_sndbuf:
358                         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
359                         if ((val * 2) < SOCK_MIN_SNDBUF)
360                                 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
361                         else
362                                 sk->sk_sndbuf = val * 2;
363
364                         /*
365                          *      Wake up sending tasks if we
366                          *      upped the value.
367                          */
368                         sk->sk_write_space(sk);
369                         break;
370
371                 case SO_SNDBUFFORCE:
372                         if (!capable(CAP_NET_ADMIN)) {
373                                 ret = -EPERM;
374                                 break;
375                         }
376                         goto set_sndbuf;
377
378                 case SO_RCVBUF:
379                         /* Don't error on this BSD doesn't and if you think
380                            about it this is right. Otherwise apps have to
381                            play 'guess the biggest size' games. RCVBUF/SNDBUF
382                            are treated in BSD as hints */
383                           
384                         if (val > sysctl_rmem_max)
385                                 val = sysctl_rmem_max;
386 set_rcvbuf:
387                         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
388                         /*
389                          * We double it on the way in to account for
390                          * "struct sk_buff" etc. overhead.   Applications
391                          * assume that the SO_RCVBUF setting they make will
392                          * allow that much actual data to be received on that
393                          * socket.
394                          *
395                          * Applications are unaware that "struct sk_buff" and
396                          * other overheads allocate from the receive buffer
397                          * during socket buffer allocation.
398                          *
399                          * And after considering the possible alternatives,
400                          * returning the value we actually used in getsockopt
401                          * is the most desirable behavior.
402                          */
403                         if ((val * 2) < SOCK_MIN_RCVBUF)
404                                 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
405                         else
406                                 sk->sk_rcvbuf = val * 2;
407                         break;
408
409                 case SO_RCVBUFFORCE:
410                         if (!capable(CAP_NET_ADMIN)) {
411                                 ret = -EPERM;
412                                 break;
413                         }
414                         goto set_rcvbuf;
415
416                 case SO_KEEPALIVE:
417 #ifdef CONFIG_INET
418                         if (sk->sk_protocol == IPPROTO_TCP)
419                                 tcp_set_keepalive(sk, valbool);
420 #endif
421                         sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
422                         break;
423
424                 case SO_OOBINLINE:
425                         sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
426                         break;
427
428                 case SO_NO_CHECK:
429                         sk->sk_no_check = valbool;
430                         break;
431
432                 case SO_PRIORITY:
433                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
434                                 sk->sk_priority = val;
435                         else
436                                 ret = -EPERM;
437                         break;
438
439                 case SO_LINGER:
440                         if(optlen<sizeof(ling)) {
441                                 ret = -EINVAL;  /* 1003.1g */
442                                 break;
443                         }
444                         if (copy_from_user(&ling,optval,sizeof(ling))) {
445                                 ret = -EFAULT;
446                                 break;
447                         }
448                         if (!ling.l_onoff)
449                                 sock_reset_flag(sk, SOCK_LINGER);
450                         else {
451 #if (BITS_PER_LONG == 32)
452                                 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
453                                         sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
454                                 else
455 #endif
456                                         sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
457                                 sock_set_flag(sk, SOCK_LINGER);
458                         }
459                         break;
460
461                 case SO_BSDCOMPAT:
462                         sock_warn_obsolete_bsdism("setsockopt");
463                         break;
464
465                 case SO_PASSCRED:
466                         if (valbool)
467                                 set_bit(SOCK_PASSCRED, &sock->flags);
468                         else
469                                 clear_bit(SOCK_PASSCRED, &sock->flags);
470                         break;
471
472                 case SO_TIMESTAMP:
473                         if (valbool)  {
474                                 sock_set_flag(sk, SOCK_RCVTSTAMP);
475                                 sock_enable_timestamp(sk);
476                         } else
477                                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
478                         break;
479
480                 case SO_RCVLOWAT:
481                         if (val < 0)
482                                 val = INT_MAX;
483                         sk->sk_rcvlowat = val ? : 1;
484                         break;
485
486                 case SO_RCVTIMEO:
487                         ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
488                         break;
489
490                 case SO_SNDTIMEO:
491                         ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
492                         break;
493
494 #ifdef CONFIG_NETDEVICES
495                 case SO_BINDTODEVICE:
496                 {
497                         char devname[IFNAMSIZ]; 
498
499                         /* Sorry... */ 
500                         if (!capable(CAP_NET_RAW)) {
501                                 ret = -EPERM;
502                                 break;
503                         }
504
505                         /* Bind this socket to a particular device like "eth0",
506                          * as specified in the passed interface name. If the
507                          * name is "" or the option length is zero the socket 
508                          * is not bound. 
509                          */ 
510
511                         if (!valbool) {
512                                 sk->sk_bound_dev_if = 0;
513                         } else {
514                                 if (optlen > IFNAMSIZ - 1)
515                                         optlen = IFNAMSIZ - 1;
516                                 memset(devname, 0, sizeof(devname));
517                                 if (copy_from_user(devname, optval, optlen)) {
518                                         ret = -EFAULT;
519                                         break;
520                                 }
521
522                                 /* Remove any cached route for this socket. */
523                                 sk_dst_reset(sk);
524
525                                 if (devname[0] == '\0') {
526                                         sk->sk_bound_dev_if = 0;
527                                 } else {
528                                         struct net_device *dev = dev_get_by_name(devname);
529                                         if (!dev) {
530                                                 ret = -ENODEV;
531                                                 break;
532                                         }
533                                         sk->sk_bound_dev_if = dev->ifindex;
534                                         dev_put(dev);
535                                 }
536                         }
537                         break;
538                 }
539 #endif
540
541
542                 case SO_ATTACH_FILTER:
543                         ret = -EINVAL;
544                         if (optlen == sizeof(struct sock_fprog)) {
545                                 struct sock_fprog fprog;
546
547                                 ret = -EFAULT;
548                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
549                                         break;
550
551                                 ret = sk_attach_filter(&fprog, sk);
552                         }
553                         break;
554
555                 case SO_DETACH_FILTER:
556                         spin_lock_bh(&sk->sk_lock.slock);
557                         filter = sk->sk_filter;
558                         if (filter) {
559                                 sk->sk_filter = NULL;
560                                 spin_unlock_bh(&sk->sk_lock.slock);
561                                 sk_filter_release(sk, filter);
562                                 break;
563                         }
564                         spin_unlock_bh(&sk->sk_lock.slock);
565                         ret = -ENONET;
566                         break;
567
568                 /* We implement the SO_SNDLOWAT etc to
569                    not be settable (1003.1g 5.3) */
570                 default:
571                         ret = -ENOPROTOOPT;
572                         break;
573         }
574         release_sock(sk);
575         return ret;
576 }
577
578
579 int sock_getsockopt(struct socket *sock, int level, int optname,
580                     char __user *optval, int __user *optlen)
581 {
582         struct sock *sk = sock->sk;
583         
584         union
585         {
586                 int val;
587                 struct linger ling;
588                 struct timeval tm;
589         } v;
590         
591         unsigned int lv = sizeof(int);
592         int len;
593         
594         if(get_user(len,optlen))
595                 return -EFAULT;
596         if(len < 0)
597                 return -EINVAL;
598                 
599         switch(optname) 
600         {
601                 case SO_DEBUG:          
602                         v.val = sock_flag(sk, SOCK_DBG);
603                         break;
604                 
605                 case SO_DONTROUTE:
606                         v.val = sock_flag(sk, SOCK_LOCALROUTE);
607                         break;
608                 
609                 case SO_BROADCAST:
610                         v.val = !!sock_flag(sk, SOCK_BROADCAST);
611                         break;
612
613                 case SO_SNDBUF:
614                         v.val = sk->sk_sndbuf;
615                         break;
616                 
617                 case SO_RCVBUF:
618                         v.val = sk->sk_rcvbuf;
619                         break;
620
621                 case SO_REUSEADDR:
622                         v.val = sk->sk_reuse;
623                         break;
624
625                 case SO_KEEPALIVE:
626                         v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
627                         break;
628
629                 case SO_TYPE:
630                         v.val = sk->sk_type;                            
631                         break;
632
633                 case SO_ERROR:
634                         v.val = -sock_error(sk);
635                         if(v.val==0)
636                                 v.val = xchg(&sk->sk_err_soft, 0);
637                         break;
638
639                 case SO_OOBINLINE:
640                         v.val = !!sock_flag(sk, SOCK_URGINLINE);
641                         break;
642         
643                 case SO_NO_CHECK:
644                         v.val = sk->sk_no_check;
645                         break;
646
647                 case SO_PRIORITY:
648                         v.val = sk->sk_priority;
649                         break;
650                 
651                 case SO_LINGER: 
652                         lv              = sizeof(v.ling);
653                         v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
654                         v.ling.l_linger = sk->sk_lingertime / HZ;
655                         break;
656                                         
657                 case SO_BSDCOMPAT:
658                         sock_warn_obsolete_bsdism("getsockopt");
659                         break;
660
661                 case SO_TIMESTAMP:
662                         v.val = sock_flag(sk, SOCK_RCVTSTAMP);
663                         break;
664
665                 case SO_RCVTIMEO:
666                         lv=sizeof(struct timeval);
667                         if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
668                                 v.tm.tv_sec = 0;
669                                 v.tm.tv_usec = 0;
670                         } else {
671                                 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
672                                 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
673                         }
674                         break;
675
676                 case SO_SNDTIMEO:
677                         lv=sizeof(struct timeval);
678                         if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
679                                 v.tm.tv_sec = 0;
680                                 v.tm.tv_usec = 0;
681                         } else {
682                                 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
683                                 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
684                         }
685                         break;
686
687                 case SO_RCVLOWAT:
688                         v.val = sk->sk_rcvlowat;
689                         break;
690
691                 case SO_SNDLOWAT:
692                         v.val=1;
693                         break; 
694
695                 case SO_PASSCRED:
696                         v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
697                         break;
698
699                 case SO_PEERCRED:
700                         if (len > sizeof(sk->sk_peercred))
701                                 len = sizeof(sk->sk_peercred);
702                         if (copy_to_user(optval, &sk->sk_peercred, len))
703                                 return -EFAULT;
704                         goto lenout;
705
706                 case SO_PEERNAME:
707                 {
708                         char address[128];
709
710                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
711                                 return -ENOTCONN;
712                         if (lv < len)
713                                 return -EINVAL;
714                         if (copy_to_user(optval, address, len))
715                                 return -EFAULT;
716                         goto lenout;
717                 }
718
719                 /* Dubious BSD thing... Probably nobody even uses it, but
720                  * the UNIX standard wants it for whatever reason... -DaveM
721                  */
722                 case SO_ACCEPTCONN:
723                         v.val = sk->sk_state == TCP_LISTEN;
724                         break;
725
726                 case SO_PEERSEC:
727                         return security_socket_getpeersec_stream(sock, optval, optlen, len);
728
729                 default:
730                         return(-ENOPROTOOPT);
731         }
732         if (len > lv)
733                 len = lv;
734         if (copy_to_user(optval, &v, len))
735                 return -EFAULT;
736 lenout:
737         if (put_user(len, optlen))
738                 return -EFAULT;
739         return 0;
740 }
741
742 /**
743  *      sk_alloc - All socket objects are allocated here
744  *      @family: protocol family
745  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
746  *      @prot: struct proto associated with this new sock instance
747  *      @zero_it: if we should zero the newly allocated sock
748  */
749 struct sock *sk_alloc(int family, gfp_t priority,
750                       struct proto *prot, int zero_it)
751 {
752         struct sock *sk = NULL;
753         kmem_cache_t *slab = prot->slab;
754
755         if (slab != NULL)
756                 sk = kmem_cache_alloc(slab, priority);
757         else
758                 sk = kmalloc(prot->obj_size, priority);
759
760         if (sk) {
761                 if (zero_it) {
762                         memset(sk, 0, prot->obj_size);
763                         sk->sk_family = family;
764                         /*
765                          * See comment in struct sock definition to understand
766                          * why we need sk_prot_creator -acme
767                          */
768                         sk->sk_prot = sk->sk_prot_creator = prot;
769                         sock_lock_init(sk);
770                 }
771                 
772                 if (security_sk_alloc(sk, family, priority))
773                         goto out_free;
774
775                 if (!try_module_get(prot->owner))
776                         goto out_free;
777         }
778         return sk;
779
780 out_free:
781         if (slab != NULL)
782                 kmem_cache_free(slab, sk);
783         else
784                 kfree(sk);
785         return NULL;
786 }
787
788 void sk_free(struct sock *sk)
789 {
790         struct sk_filter *filter;
791         struct module *owner = sk->sk_prot_creator->owner;
792
793         if (sk->sk_destruct)
794                 sk->sk_destruct(sk);
795
796         filter = sk->sk_filter;
797         if (filter) {
798                 sk_filter_release(sk, filter);
799                 sk->sk_filter = NULL;
800         }
801
802         sock_disable_timestamp(sk);
803
804         if (atomic_read(&sk->sk_omem_alloc))
805                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
806                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
807
808         security_sk_free(sk);
809         if (sk->sk_prot_creator->slab != NULL)
810                 kmem_cache_free(sk->sk_prot_creator->slab, sk);
811         else
812                 kfree(sk);
813         module_put(owner);
814 }
815
816 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
817 {
818         struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
819
820         if (newsk != NULL) {
821                 struct sk_filter *filter;
822
823                 memcpy(newsk, sk, sk->sk_prot->obj_size);
824
825                 /* SANITY */
826                 sk_node_init(&newsk->sk_node);
827                 sock_lock_init(newsk);
828                 bh_lock_sock(newsk);
829
830                 atomic_set(&newsk->sk_rmem_alloc, 0);
831                 atomic_set(&newsk->sk_wmem_alloc, 0);
832                 atomic_set(&newsk->sk_omem_alloc, 0);
833                 skb_queue_head_init(&newsk->sk_receive_queue);
834                 skb_queue_head_init(&newsk->sk_write_queue);
835 #ifdef CONFIG_NET_DMA
836                 skb_queue_head_init(&newsk->sk_async_wait_queue);
837 #endif
838
839                 rwlock_init(&newsk->sk_dst_lock);
840                 rwlock_init(&newsk->sk_callback_lock);
841
842                 newsk->sk_dst_cache     = NULL;
843                 newsk->sk_wmem_queued   = 0;
844                 newsk->sk_forward_alloc = 0;
845                 newsk->sk_send_head     = NULL;
846                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
847                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
848
849                 sock_reset_flag(newsk, SOCK_DONE);
850                 skb_queue_head_init(&newsk->sk_error_queue);
851
852                 filter = newsk->sk_filter;
853                 if (filter != NULL)
854                         sk_filter_charge(newsk, filter);
855
856                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
857                         /* It is still raw copy of parent, so invalidate
858                          * destructor and make plain sk_free() */
859                         newsk->sk_destruct = NULL;
860                         sk_free(newsk);
861                         newsk = NULL;
862                         goto out;
863                 }
864
865                 newsk->sk_err      = 0;
866                 newsk->sk_priority = 0;
867                 atomic_set(&newsk->sk_refcnt, 2);
868
869                 /*
870                  * Increment the counter in the same struct proto as the master
871                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
872                  * is the same as sk->sk_prot->socks, as this field was copied
873                  * with memcpy).
874                  *
875                  * This _changes_ the previous behaviour, where
876                  * tcp_create_openreq_child always was incrementing the
877                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
878                  * to be taken into account in all callers. -acme
879                  */
880                 sk_refcnt_debug_inc(newsk);
881                 newsk->sk_socket = NULL;
882                 newsk->sk_sleep  = NULL;
883
884                 if (newsk->sk_prot->sockets_allocated)
885                         atomic_inc(newsk->sk_prot->sockets_allocated);
886         }
887 out:
888         return newsk;
889 }
890
891 EXPORT_SYMBOL_GPL(sk_clone);
892
893 void __init sk_init(void)
894 {
895         if (num_physpages <= 4096) {
896                 sysctl_wmem_max = 32767;
897                 sysctl_rmem_max = 32767;
898                 sysctl_wmem_default = 32767;
899                 sysctl_rmem_default = 32767;
900         } else if (num_physpages >= 131072) {
901                 sysctl_wmem_max = 131071;
902                 sysctl_rmem_max = 131071;
903         }
904 }
905
906 /*
907  *      Simple resource managers for sockets.
908  */
909
910
911 /* 
912  * Write buffer destructor automatically called from kfree_skb. 
913  */
914 void sock_wfree(struct sk_buff *skb)
915 {
916         struct sock *sk = skb->sk;
917
918         /* In case it might be waiting for more memory. */
919         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
920         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
921                 sk->sk_write_space(sk);
922         sock_put(sk);
923 }
924
925 /* 
926  * Read buffer destructor automatically called from kfree_skb. 
927  */
928 void sock_rfree(struct sk_buff *skb)
929 {
930         struct sock *sk = skb->sk;
931
932         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
933 }
934
935
936 int sock_i_uid(struct sock *sk)
937 {
938         int uid;
939
940         read_lock(&sk->sk_callback_lock);
941         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
942         read_unlock(&sk->sk_callback_lock);
943         return uid;
944 }
945
946 unsigned long sock_i_ino(struct sock *sk)
947 {
948         unsigned long ino;
949
950         read_lock(&sk->sk_callback_lock);
951         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
952         read_unlock(&sk->sk_callback_lock);
953         return ino;
954 }
955
956 /*
957  * Allocate a skb from the socket's send buffer.
958  */
959 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
960                              gfp_t priority)
961 {
962         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
963                 struct sk_buff * skb = alloc_skb(size, priority);
964                 if (skb) {
965                         skb_set_owner_w(skb, sk);
966                         return skb;
967                 }
968         }
969         return NULL;
970 }
971
972 /*
973  * Allocate a skb from the socket's receive buffer.
974  */ 
975 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
976                              gfp_t priority)
977 {
978         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
979                 struct sk_buff *skb = alloc_skb(size, priority);
980                 if (skb) {
981                         skb_set_owner_r(skb, sk);
982                         return skb;
983                 }
984         }
985         return NULL;
986 }
987
988 /* 
989  * Allocate a memory block from the socket's option memory buffer.
990  */ 
991 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
992 {
993         if ((unsigned)size <= sysctl_optmem_max &&
994             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
995                 void *mem;
996                 /* First do the add, to avoid the race if kmalloc
997                  * might sleep.
998                  */
999                 atomic_add(size, &sk->sk_omem_alloc);
1000                 mem = kmalloc(size, priority);
1001                 if (mem)
1002                         return mem;
1003                 atomic_sub(size, &sk->sk_omem_alloc);
1004         }
1005         return NULL;
1006 }
1007
1008 /*
1009  * Free an option memory block.
1010  */
1011 void sock_kfree_s(struct sock *sk, void *mem, int size)
1012 {
1013         kfree(mem);
1014         atomic_sub(size, &sk->sk_omem_alloc);
1015 }
1016
1017 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1018    I think, these locks should be removed for datagram sockets.
1019  */
1020 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1021 {
1022         DEFINE_WAIT(wait);
1023
1024         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1025         for (;;) {
1026                 if (!timeo)
1027                         break;
1028                 if (signal_pending(current))
1029                         break;
1030                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1031                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1032                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1033                         break;
1034                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1035                         break;
1036                 if (sk->sk_err)
1037                         break;
1038                 timeo = schedule_timeout(timeo);
1039         }
1040         finish_wait(sk->sk_sleep, &wait);
1041         return timeo;
1042 }
1043
1044
1045 /*
1046  *      Generic send/receive buffer handlers
1047  */
1048
1049 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1050                                             unsigned long header_len,
1051                                             unsigned long data_len,
1052                                             int noblock, int *errcode)
1053 {
1054         struct sk_buff *skb;
1055         gfp_t gfp_mask;
1056         long timeo;
1057         int err;
1058
1059         gfp_mask = sk->sk_allocation;
1060         if (gfp_mask & __GFP_WAIT)
1061                 gfp_mask |= __GFP_REPEAT;
1062
1063         timeo = sock_sndtimeo(sk, noblock);
1064         while (1) {
1065                 err = sock_error(sk);
1066                 if (err != 0)
1067                         goto failure;
1068
1069                 err = -EPIPE;
1070                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1071                         goto failure;
1072
1073                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1074                         skb = alloc_skb(header_len, sk->sk_allocation);
1075                         if (skb) {
1076                                 int npages;
1077                                 int i;
1078
1079                                 /* No pages, we're done... */
1080                                 if (!data_len)
1081                                         break;
1082
1083                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1084                                 skb->truesize += data_len;
1085                                 skb_shinfo(skb)->nr_frags = npages;
1086                                 for (i = 0; i < npages; i++) {
1087                                         struct page *page;
1088                                         skb_frag_t *frag;
1089
1090                                         page = alloc_pages(sk->sk_allocation, 0);
1091                                         if (!page) {
1092                                                 err = -ENOBUFS;
1093                                                 skb_shinfo(skb)->nr_frags = i;
1094                                                 kfree_skb(skb);
1095                                                 goto failure;
1096                                         }
1097
1098                                         frag = &skb_shinfo(skb)->frags[i];
1099                                         frag->page = page;
1100                                         frag->page_offset = 0;
1101                                         frag->size = (data_len >= PAGE_SIZE ?
1102                                                       PAGE_SIZE :
1103                                                       data_len);
1104                                         data_len -= PAGE_SIZE;
1105                                 }
1106
1107                                 /* Full success... */
1108                                 break;
1109                         }
1110                         err = -ENOBUFS;
1111                         goto failure;
1112                 }
1113                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1114                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1115                 err = -EAGAIN;
1116                 if (!timeo)
1117                         goto failure;
1118                 if (signal_pending(current))
1119                         goto interrupted;
1120                 timeo = sock_wait_for_wmem(sk, timeo);
1121         }
1122
1123         skb_set_owner_w(skb, sk);
1124         return skb;
1125
1126 interrupted:
1127         err = sock_intr_errno(timeo);
1128 failure:
1129         *errcode = err;
1130         return NULL;
1131 }
1132
/*
 * Allocate a send skb of @size linear bytes with no page fragments:
 * sock_alloc_send_pskb() with a data length of zero.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        struct sk_buff *skb;

        skb = sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
        return skb;
}
1138
1139 static void __lock_sock(struct sock *sk)
1140 {
1141         DEFINE_WAIT(wait);
1142
1143         for(;;) {
1144                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1145                                         TASK_UNINTERRUPTIBLE);
1146                 spin_unlock_bh(&sk->sk_lock.slock);
1147                 schedule();
1148                 spin_lock_bh(&sk->sk_lock.slock);
1149                 if(!sock_owned_by_user(sk))
1150                         break;
1151         }
1152         finish_wait(&sk->sk_lock.wq, &wait);
1153 }
1154
1155 static void __release_sock(struct sock *sk)
1156 {
1157         struct sk_buff *skb = sk->sk_backlog.head;
1158
1159         do {
1160                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1161                 bh_unlock_sock(sk);
1162
1163                 do {
1164                         struct sk_buff *next = skb->next;
1165
1166                         skb->next = NULL;
1167                         sk->sk_backlog_rcv(sk, skb);
1168
1169                         /*
1170                          * We are in process context here with softirqs
1171                          * disabled, use cond_resched_softirq() to preempt.
1172                          * This is safe to do because we've taken the backlog
1173                          * queue private:
1174                          */
1175                         cond_resched_softirq();
1176
1177                         skb = next;
1178                 } while (skb != NULL);
1179
1180                 bh_lock_sock(sk);
1181         } while((skb = sk->sk_backlog.head) != NULL);
1182 }
1183
1184 /**
1185  * sk_wait_data - wait for data to arrive at sk_receive_queue
1186  * @sk:    sock to wait on
1187  * @timeo: for how long
1188  *
1189  * Now socket state including sk->sk_err is changed only under lock,
1190  * hence we may omit checks after joining wait queue.
1191  * We check receive queue before schedule() only as optimization;
1192  * it is very likely that release_sock() added new data.
1193  */
1194 int sk_wait_data(struct sock *sk, long *timeo)
1195 {
1196         int rc;
1197         DEFINE_WAIT(wait);
1198
1199         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1200         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1201         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1202         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1203         finish_wait(sk->sk_sleep, &wait);
1204         return rc;
1205 }
1206
1207 EXPORT_SYMBOL(sk_wait_data);
1208
1209 /*
1210  * Set of default routines for initialising struct proto_ops when
1211  * the protocol does not support a particular function. In certain
1212  * cases where it makes no sense for a protocol to have a "do nothing"
1213  * function, some default processing is provided.
1214  */
1215
/* Default bind for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock,
                 struct sockaddr *saddr,
                 int len)
{
        return -EOPNOTSUPP;
}
1220
/* Default connect for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock,
                    struct sockaddr *saddr,
                    int len,
                    int flags)
{
        return -EOPNOTSUPP;
}
1226
/* Default socketpair for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1,
                       struct socket *sock2)
{
        return -EOPNOTSUPP;
}
1231
/* Default accept for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock,
                   struct socket *newsock,
                   int flags)
{
        return -EOPNOTSUPP;
}
1236
/* Default getname for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock,
                    struct sockaddr *saddr,
                    int *len,
                    int peer)
{
        return -EOPNOTSUPP;
}
1242
1243 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1244 {
1245         return 0;
1246 }
1247
/* Default ioctl for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock,
                  unsigned int cmd,
                  unsigned long arg)
{
        return -EOPNOTSUPP;
}
1252
/* Default listen for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock,
                   int backlog)
{
        return -EOPNOTSUPP;
}
1257
/* Default shutdown for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock,
                     int how)
{
        return -EOPNOTSUPP;
}
1262
1263 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1264                     char __user *optval, int optlen)
1265 {
1266         return -EOPNOTSUPP;
1267 }
1268
1269 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1270                     char __user *optval, int __user *optlen)
1271 {
1272         return -EOPNOTSUPP;
1273 }
1274
/* Default sendmsg for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb,
                    struct socket *sock,
                    struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}
1280
/* Default recvmsg for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb,
                    struct socket *sock,
                    struct msghdr *m,
                    size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
1286
/* Default mmap for protocols without mmap support: always -ENODEV. */
int sock_no_mmap(struct file *file,
                 struct socket *sock,
                 struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
1292
1293 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1294 {
1295         ssize_t res;
1296         struct msghdr msg = {.msg_flags = flags};
1297         struct kvec iov;
1298         char *kaddr = kmap(page);
1299         iov.iov_base = kaddr + offset;
1300         iov.iov_len = size;
1301         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1302         kunmap(page);
1303         return res;
1304 }
1305
1306 /*
1307  *      Default Socket Callbacks
1308  */
1309
1310 static void sock_def_wakeup(struct sock *sk)
1311 {
1312         read_lock(&sk->sk_callback_lock);
1313         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1314                 wake_up_interruptible_all(sk->sk_sleep);
1315         read_unlock(&sk->sk_callback_lock);
1316 }
1317
1318 static void sock_def_error_report(struct sock *sk)
1319 {
1320         read_lock(&sk->sk_callback_lock);
1321         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1322                 wake_up_interruptible(sk->sk_sleep);
1323         sk_wake_async(sk,0,POLL_ERR); 
1324         read_unlock(&sk->sk_callback_lock);
1325 }
1326
1327 static void sock_def_readable(struct sock *sk, int len)
1328 {
1329         read_lock(&sk->sk_callback_lock);
1330         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1331                 wake_up_interruptible(sk->sk_sleep);
1332         sk_wake_async(sk,1,POLL_IN);
1333         read_unlock(&sk->sk_callback_lock);
1334 }
1335
1336 static void sock_def_write_space(struct sock *sk)
1337 {
1338         read_lock(&sk->sk_callback_lock);
1339
1340         /* Do not wake up a writer until he can make "significant"
1341          * progress.  --DaveM
1342          */
1343         if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1344                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1345                         wake_up_interruptible(sk->sk_sleep);
1346
1347                 /* Should agree with poll, otherwise some programs break */
1348                 if (sock_writeable(sk))
1349                         sk_wake_async(sk, 2, POLL_OUT);
1350         }
1351
1352         read_unlock(&sk->sk_callback_lock);
1353 }
1354
1355 static void sock_def_destruct(struct sock *sk)
1356 {
1357         kfree(sk->sk_protinfo);
1358 }
1359
1360 void sk_send_sigurg(struct sock *sk)
1361 {
1362         if (sk->sk_socket && sk->sk_socket->file)
1363                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1364                         sk_wake_async(sk, 3, POLL_PRI);
1365 }
1366
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
1373
1374 EXPORT_SYMBOL(sk_reset_timer);
1375
/*
 * Cancel @timer.  One reference on @sk is dropped with __sock_put() only
 * when the timer was pending and del_timer() returned non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_pending(timer))
                return;

        if (del_timer(timer))
                __sock_put(sk);
}
1381
1382 EXPORT_SYMBOL(sk_stop_timer);
1383
1384 void sock_init_data(struct socket *sock, struct sock *sk)
1385 {
1386         skb_queue_head_init(&sk->sk_receive_queue);
1387         skb_queue_head_init(&sk->sk_write_queue);
1388         skb_queue_head_init(&sk->sk_error_queue);
1389 #ifdef CONFIG_NET_DMA
1390         skb_queue_head_init(&sk->sk_async_wait_queue);
1391 #endif
1392
1393         sk->sk_send_head        =       NULL;
1394
1395         init_timer(&sk->sk_timer);
1396         
1397         sk->sk_allocation       =       GFP_KERNEL;
1398         sk->sk_rcvbuf           =       sysctl_rmem_default;
1399         sk->sk_sndbuf           =       sysctl_wmem_default;
1400         sk->sk_state            =       TCP_CLOSE;
1401         sk->sk_socket           =       sock;
1402
1403         sock_set_flag(sk, SOCK_ZAPPED);
1404
1405         if(sock)
1406         {
1407                 sk->sk_type     =       sock->type;
1408                 sk->sk_sleep    =       &sock->wait;
1409                 sock->sk        =       sk;
1410         } else
1411                 sk->sk_sleep    =       NULL;
1412
1413         rwlock_init(&sk->sk_dst_lock);
1414         rwlock_init(&sk->sk_callback_lock);
1415
1416         sk->sk_state_change     =       sock_def_wakeup;
1417         sk->sk_data_ready       =       sock_def_readable;
1418         sk->sk_write_space      =       sock_def_write_space;
1419         sk->sk_error_report     =       sock_def_error_report;
1420         sk->sk_destruct         =       sock_def_destruct;
1421
1422         sk->sk_sndmsg_page      =       NULL;
1423         sk->sk_sndmsg_off       =       0;
1424
1425         sk->sk_peercred.pid     =       0;
1426         sk->sk_peercred.uid     =       -1;
1427         sk->sk_peercred.gid     =       -1;
1428         sk->sk_write_pending    =       0;
1429         sk->sk_rcvlowat         =       1;
1430         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1431         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1432
1433         sk->sk_stamp.tv_sec     = -1L;
1434         sk->sk_stamp.tv_usec    = -1L;
1435
1436         atomic_set(&sk->sk_refcnt, 1);
1437 }
1438
1439 void fastcall lock_sock(struct sock *sk)
1440 {
1441         might_sleep();
1442         spin_lock_bh(&(sk->sk_lock.slock));
1443         if (sk->sk_lock.owner)
1444                 __lock_sock(sk);
1445         sk->sk_lock.owner = (void *)1;
1446         spin_unlock_bh(&(sk->sk_lock.slock));
1447 }
1448
1449 EXPORT_SYMBOL(lock_sock);
1450
1451 void fastcall release_sock(struct sock *sk)
1452 {
1453         spin_lock_bh(&(sk->sk_lock.slock));
1454         if (sk->sk_backlog.tail)
1455                 __release_sock(sk);
1456         sk->sk_lock.owner = NULL;
1457         if (waitqueue_active(&(sk->sk_lock.wq)))
1458                 wake_up(&(sk->sk_lock.wq));
1459         spin_unlock_bh(&(sk->sk_lock.slock));
1460 }
1461 EXPORT_SYMBOL(release_sock);
1462
1463 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1464
1465         if (!sock_flag(sk, SOCK_TIMESTAMP))
1466                 sock_enable_timestamp(sk);
1467         if (sk->sk_stamp.tv_sec == -1) 
1468                 return -ENOENT;
1469         if (sk->sk_stamp.tv_sec == 0)
1470                 do_gettimeofday(&sk->sk_stamp);
1471         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1472                 -EFAULT : 0; 
1473
1474 EXPORT_SYMBOL(sock_get_timestamp);
1475
1476 void sock_enable_timestamp(struct sock *sk)
1477 {       
1478         if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
1479                 sock_set_flag(sk, SOCK_TIMESTAMP);
1480                 net_enable_timestamp();
1481         }
1482 }
1483 EXPORT_SYMBOL(sock_enable_timestamp); 
1484
1485 /*
1486  *      Get a socket option on an socket.
1487  *
1488  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1489  *      asynchronous errors should be reported by getsockopt. We assume
1490  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1491  */
1492 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1493                            char __user *optval, int __user *optlen)
1494 {
1495         struct sock *sk = sock->sk;
1496
1497         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1498 }
1499
1500 EXPORT_SYMBOL(sock_common_getsockopt);
1501
1502 #ifdef CONFIG_COMPAT
1503 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1504                                   char __user *optval, int __user *optlen)
1505 {
1506         struct sock *sk = sock->sk;
1507
1508         if (sk->sk_prot->compat_setsockopt != NULL)
1509                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1510                                                       optval, optlen);
1511         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1512 }
1513 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1514 #endif
1515
1516 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1517                         struct msghdr *msg, size_t size, int flags)
1518 {
1519         struct sock *sk = sock->sk;
1520         int addr_len = 0;
1521         int err;
1522
1523         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1524                                    flags & ~MSG_DONTWAIT, &addr_len);
1525         if (err >= 0)
1526                 msg->msg_namelen = addr_len;
1527         return err;
1528 }
1529
1530 EXPORT_SYMBOL(sock_common_recvmsg);
1531
1532 /*
1533  *      Set socket options on an inet socket.
1534  */
1535 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1536                            char __user *optval, int optlen)
1537 {
1538         struct sock *sk = sock->sk;
1539
1540         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1541 }
1542
1543 EXPORT_SYMBOL(sock_common_setsockopt);
1544
1545 #ifdef CONFIG_COMPAT
1546 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1547                                   char __user *optval, int optlen)
1548 {
1549         struct sock *sk = sock->sk;
1550
1551         if (sk->sk_prot->compat_setsockopt != NULL)
1552                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1553                                                       optval, optlen);
1554         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1555 }
1556 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1557 #endif
1558
1559 void sk_common_release(struct sock *sk)
1560 {
1561         if (sk->sk_prot->destroy)
1562                 sk->sk_prot->destroy(sk);
1563
1564         /*
1565          * Observation: when sock_common_release is called, processes have
1566          * no access to socket. But net still has.
1567          * Step one, detach it from networking:
1568          *
1569          * A. Remove from hash tables.
1570          */
1571
1572         sk->sk_prot->unhash(sk);
1573
1574         /*
1575          * In this point socket cannot receive new packets, but it is possible
1576          * that some packets are in flight because some CPU runs receiver and
1577          * did hash table lookup before we unhashed socket. They will achieve
1578          * receive queue and will be purged by socket destructor.
1579          *
1580          * Also we still have packets pending on receive queue and probably,
1581          * our own packets waiting in device queues. sock_destroy will drain
1582          * receive queue, but transmitted packets will delay socket destruction
1583          * until the last reference will be released.
1584          */
1585
1586         sock_orphan(sk);
1587
1588         xfrm_sk_free_policy(sk);
1589
1590         sk_refcnt_debug_release(sk);
1591         sock_put(sk);
1592 }
1593
1594 EXPORT_SYMBOL(sk_common_release);
1595
1596 static DEFINE_RWLOCK(proto_list_lock);
1597 static LIST_HEAD(proto_list);
1598
1599 int proto_register(struct proto *prot, int alloc_slab)
1600 {
1601         char *request_sock_slab_name = NULL;
1602         char *timewait_sock_slab_name;
1603         int rc = -ENOBUFS;
1604
1605         if (alloc_slab) {
1606                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1607                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1608
1609                 if (prot->slab == NULL) {
1610                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1611                                prot->name);
1612                         goto out;
1613                 }
1614
1615                 if (prot->rsk_prot != NULL) {
1616                         static const char mask[] = "request_sock_%s";
1617
1618                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1619                         if (request_sock_slab_name == NULL)
1620                                 goto out_free_sock_slab;
1621
1622                         sprintf(request_sock_slab_name, mask, prot->name);
1623                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1624                                                                  prot->rsk_prot->obj_size, 0,
1625                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1626
1627                         if (prot->rsk_prot->slab == NULL) {
1628                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1629                                        prot->name);
1630                                 goto out_free_request_sock_slab_name;
1631                         }
1632                 }
1633
1634                 if (prot->twsk_prot != NULL) {
1635                         static const char mask[] = "tw_sock_%s";
1636
1637                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1638
1639                         if (timewait_sock_slab_name == NULL)
1640                                 goto out_free_request_sock_slab;
1641
1642                         sprintf(timewait_sock_slab_name, mask, prot->name);
1643                         prot->twsk_prot->twsk_slab =
1644                                 kmem_cache_create(timewait_sock_slab_name,
1645                                                   prot->twsk_prot->twsk_obj_size,
1646                                                   0, SLAB_HWCACHE_ALIGN,
1647                                                   NULL, NULL);
1648                         if (prot->twsk_prot->twsk_slab == NULL)
1649                                 goto out_free_timewait_sock_slab_name;
1650                 }
1651         }
1652
1653         write_lock(&proto_list_lock);
1654         list_add(&prot->node, &proto_list);
1655         write_unlock(&proto_list_lock);
1656         rc = 0;
1657 out:
1658         return rc;
1659 out_free_timewait_sock_slab_name:
1660         kfree(timewait_sock_slab_name);
1661 out_free_request_sock_slab:
1662         if (prot->rsk_prot && prot->rsk_prot->slab) {
1663                 kmem_cache_destroy(prot->rsk_prot->slab);
1664                 prot->rsk_prot->slab = NULL;
1665         }
1666 out_free_request_sock_slab_name:
1667         kfree(request_sock_slab_name);
1668 out_free_sock_slab:
1669         kmem_cache_destroy(prot->slab);
1670         prot->slab = NULL;
1671         goto out;
1672 }
1673
1674 EXPORT_SYMBOL(proto_register);
1675
1676 void proto_unregister(struct proto *prot)
1677 {
1678         write_lock(&proto_list_lock);
1679         list_del(&prot->node);
1680         write_unlock(&proto_list_lock);
1681
1682         if (prot->slab != NULL) {
1683                 kmem_cache_destroy(prot->slab);
1684                 prot->slab = NULL;
1685         }
1686
1687         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1688                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1689
1690                 kmem_cache_destroy(prot->rsk_prot->slab);
1691                 kfree(name);
1692                 prot->rsk_prot->slab = NULL;
1693         }
1694
1695         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1696                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1697
1698                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1699                 kfree(name);
1700                 prot->twsk_prot->twsk_slab = NULL;
1701         }
1702 }
1703
1704 EXPORT_SYMBOL(proto_unregister);
1705
1706 #ifdef CONFIG_PROC_FS
1707 static inline struct proto *__proto_head(void)
1708 {
1709         return list_entry(proto_list.next, struct proto, node);
1710 }
1711
1712 static inline struct proto *proto_head(void)
1713 {
1714         return list_empty(&proto_list) ? NULL : __proto_head();
1715 }
1716
1717 static inline struct proto *proto_next(struct proto *proto)
1718 {
1719         return proto->node.next == &proto_list ? NULL :
1720                 list_entry(proto->node.next, struct proto, node);
1721 }
1722
1723 static inline struct proto *proto_get_idx(loff_t pos)
1724 {
1725         struct proto *proto;
1726         loff_t i = 0;
1727
1728         list_for_each_entry(proto, &proto_list, node)
1729                 if (i++ == pos)
1730                         goto out;
1731
1732         proto = NULL;
1733 out:
1734         return proto;
1735 }
1736
1737 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1738 {
1739         read_lock(&proto_list_lock);
1740         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1741 }
1742
1743 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1744 {
1745         ++*pos;
1746         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1747 }
1748
1749 static void proto_seq_stop(struct seq_file *seq, void *v)
1750 {
1751         read_unlock(&proto_list_lock);
1752 }
1753
/* Map a method pointer to a flag character: 'y' if it is set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
1758
1759 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1760 {
1761         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1762                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1763                    proto->name,
1764                    proto->obj_size,
1765                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1766                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1767                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1768                    proto->max_header,
1769                    proto->slab == NULL ? "no" : "yes",
1770                    module_name(proto->owner),
1771                    proto_method_implemented(proto->close),
1772                    proto_method_implemented(proto->connect),
1773                    proto_method_implemented(proto->disconnect),
1774                    proto_method_implemented(proto->accept),
1775                    proto_method_implemented(proto->ioctl),
1776                    proto_method_implemented(proto->init),
1777                    proto_method_implemented(proto->destroy),
1778                    proto_method_implemented(proto->shutdown),
1779                    proto_method_implemented(proto->setsockopt),
1780                    proto_method_implemented(proto->getsockopt),
1781                    proto_method_implemented(proto->sendmsg),
1782                    proto_method_implemented(proto->recvmsg),
1783                    proto_method_implemented(proto->sendpage),
1784                    proto_method_implemented(proto->bind),
1785                    proto_method_implemented(proto->backlog_rcv),
1786                    proto_method_implemented(proto->hash),
1787                    proto_method_implemented(proto->unhash),
1788                    proto_method_implemented(proto->get_port),
1789                    proto_method_implemented(proto->enter_memory_pressure));
1790 }
1791
1792 static int proto_seq_show(struct seq_file *seq, void *v)
1793 {
1794         if (v == SEQ_START_TOKEN)
1795                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1796                            "protocol",
1797                            "size",
1798                            "sockets",
1799                            "memory",
1800                            "press",
1801                            "maxhdr",
1802                            "slab",
1803                            "module",
1804                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1805         else
1806                 proto_seq_printf(seq, v);
1807         return 0;
1808 }
1809
1810 static struct seq_operations proto_seq_ops = {
1811         .start  = proto_seq_start,
1812         .next   = proto_seq_next,
1813         .stop   = proto_seq_stop,
1814         .show   = proto_seq_show,
1815 };
1816
1817 static int proto_seq_open(struct inode *inode, struct file *file)
1818 {
1819         return seq_open(file, &proto_seq_ops);
1820 }
1821
1822 static struct file_operations proto_seq_fops = {
1823         .owner          = THIS_MODULE,
1824         .open           = proto_seq_open,
1825         .read           = seq_read,
1826         .llseek         = seq_lseek,
1827         .release        = seq_release,
1828 };
1829
1830 static int __init proto_init(void)
1831 {
1832         /* register /proc/net/protocols */
1833         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1834 }
1835
1836 subsys_initcall(proto_init);
1837
1838 #endif /* PROC_FS */
1839
/* Symbols from this file exported for use by modules. */

/* sk_* */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);

/* sock_* */
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);

/* sock_no_* */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

/* remaining sock_* */
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);

/* sysctl_*; the rmem/wmem limits are exported only with CONFIG_SYSCTL */
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif