[AF_UNIX]: Datagram getpeersec
net/core/sock.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
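
/*
 * Worked example (hypothetical numbers): on a platform where
 * sizeof(struct sk_buff) were 240 bytes, _SK_MEM_OVERHEAD would be
 * 240 + 256 = 496 bytes and SK_WMEM_MAX = SK_RMEM_MAX = 496 * 256 =
 * 126976 bytes, i.e. the defaults always budget for roughly 256
 * queued packets regardless of the per-skb overhead of the platform.
 */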

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
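
/*
 * Conversion sketch: with HZ == 100, a timeval of { .tv_sec = 1,
 * .tv_usec = 500000 } becomes 1 * 100 + (500000 + 9999) / 10000 = 150
 * jiffies; the sub-tick remainder is rounded up so that a non-zero
 * timeout never truncates to zero.
 */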

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}

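/*
 * Queue an skb on sk's receive queue: fail with -ENOMEM when the
 * receive buffer is full, run the socket filter, charge the skb to
 * the socket with skb_set_owner_r(), and wake any reader through
 * sk_data_ready().  Callers must not hold the socket lock (see the
 * comment in the body below).
 */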
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

        /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        /* It would deadlock if sock_queue_rcv_skb were used with the
           socket lock held! We assume that users of this function are
           lock free.
        */
        err = sk_filter(sk, skb, 1);
        if (err)
                goto out;

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

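/*
 * Deliver an skb to a socket from softirq context: after the filter
 * check, hand it to sk_backlog_rcv() directly when no process owns
 * the socket, otherwise park it on the backlog to be replayed from
 * __release_sock().  Consumes one reference on sk via sock_put().
 */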
int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk->sk_backlog_rcv(sk, skb);
        else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

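/*
 * Route-cache validation helpers: if the cached dst is marked obsolete
 * and its ->check() hook rejects it for this cookie, drop the cache
 * entry and return NULL so the caller re-routes.  __sk_dst_check()
 * relies on the caller to serialize access to sk->sk_dst_cache, while
 * sk_dst_check() takes its own reference via sk_dst_get().
 */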
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk=sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this. BSD doesn't, and if you think
                           about it this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this. BSD doesn't, and if you think
                           about it this is right. Otherwise apps have to
                           play 'guess the biggest size' games. RCVBUF/SNDBUF
                           are treated in BSD as hints */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /*
                         * We double it on the way in to account for
                         * "struct sk_buff" etc. overhead.   Applications
                         * assume that the SO_RCVBUF setting they make will
                         * allow that much actual data to be received on that
                         * socket.
                         *
                         * Applications are unaware that "struct sk_buff" and
                         * other overheads allocate from the receive buffer
                         * during socket buffer allocation.
                         *
                         * And after considering the possible alternatives,
                         * returning the value we actually used in getsockopt
                         * is the most desirable behavior.
                         */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

                case SO_TIMESTAMP:
                        if (valbool)  {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ - 1)
                                        optlen = IFNAMSIZ - 1;
                                memset(devname, 0, sizeof(devname));
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                case SO_PASSSEC:
                        if (valbool)
                                set_bit(SOCK_PASSSEC, &sock->flags);
                        else
                                clear_bit(SOCK_PASSSEC, &sock->flags);
                        break;

                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
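
/*
 * Userspace view (sketch, hypothetical values): because SO_SNDBUF and
 * SO_RCVBUF are doubled on the way in to cover sk_buff overhead, a
 * process that does
 *
 *      int val = 65536;
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * ends up with sk->sk_rcvbuf == 131072, and a later
 * getsockopt(SO_RCVBUF) reports the doubled value.
 */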

int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PASSSEC:
                        v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec_stream(sock, optval, optlen, len);

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
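
/*
 * Typical SO_ERROR usage (sketch): after a non-blocking connect()
 * completes, userspace fetches and clears the pending error with
 *
 *      int err;
 *      socklen_t len = sizeof(err);
 *      getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * which maps onto the sock_error()/sk_err_soft handling above and
 * yields a positive errno value, or 0 when no error is pending.
 */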

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

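/*
 * Final teardown of a sock that has no remaining references: run the
 * protocol destructor, release any attached filter, disable timestamps,
 * warn about leaked option memory, then return the object to its slab
 * cache (or kfree() it) and drop the protocol module reference.
 */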
void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so invalidate
                         * its destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
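
/*
 * Pairing sketch: callers must free with the same size they allocated
 * so that the sk_omem_alloc accounting balances, e.g.
 *
 *      void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *      if (opt) {
 *              ...
 *              sock_kfree_s(sk, opt, len);
 *      }
 */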

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, sk->sk_allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

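/*
 * Slow path of lock_sock(): sleep exclusively and uninterruptibly on
 * sk_lock.wq until the current owner releases the socket, re-checking
 * ownership with the slock held each time we are woken.
 */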
static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for(;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if(!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

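/*
 * Replay the backlog that accumulated while the socket was owned:
 * detach the queue under the slock, then feed each skb to
 * sk_backlog_rcv() with the slock dropped, looping until no new
 * backlog appears.
 */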
static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

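/*
 * Initialise the generic part of a struct sock to sane defaults:
 * empty queues, sysctl-derived buffer sizes, the default callbacks
 * above, infinite timeouts and a single reference.  Protocol create
 * paths call this before customising the fields they care about.
 */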
void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
        skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        if(sock)
        {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);

        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp.tv_sec     = -1L;
        sk->sk_stamp.tv_usec    = -1L;

        atomic_set(&sk->sk_refcnt, 1);
}

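/*
 * lock_sock()/release_sock() implement the process-context "owned by
 * user" socket lock: lock_sock() marks the socket owned (sleeping in
 * __lock_sock() if contended), and release_sock() drains the softirq
 * backlog via __release_sock() before waking the next waiter.
 */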
1450 void fastcall lock_sock(struct sock *sk)
1451 {
1452         might_sleep();
1453         spin_lock_bh(&(sk->sk_lock.slock));
1454         if (sk->sk_lock.owner)
1455                 __lock_sock(sk);
1456         sk->sk_lock.owner = (void *)1;
1457         spin_unlock_bh(&(sk->sk_lock.slock));
1458 }
1459
1460 EXPORT_SYMBOL(lock_sock);
1461
1462 void fastcall release_sock(struct sock *sk)
1463 {
1464         spin_lock_bh(&(sk->sk_lock.slock));
1465         if (sk->sk_backlog.tail)
1466                 __release_sock(sk);
1467         sk->sk_lock.owner = NULL;
1468         if (waitqueue_active(&(sk->sk_lock.wq)))
1469                 wake_up(&(sk->sk_lock.wq));
1470         spin_unlock_bh(&(sk->sk_lock.slock));
1471 }
1472 EXPORT_SYMBOL(release_sock);
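/*
 * Usage pattern: lock_sock()/release_sock() bracket process-context access
 * to a socket.  While sk_lock.owner is set, softirq receive processing
 * queues incoming packets on sk->sk_backlog instead of touching the socket;
 * __release_sock() then replays that backlog through the protocol's
 * backlog_rcv handler.
 *
 *	lock_sock(sk);		(may sleep: process context only)
 *	... read or modify socket state ...
 *	release_sock(sk);	(drains the backlog, wakes lock waiters)
 */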
1473
1474 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1475 {
1476         if (!sock_flag(sk, SOCK_TIMESTAMP))
1477                 sock_enable_timestamp(sk);
1478         if (sk->sk_stamp.tv_sec == -1)
1479                 return -ENOENT;
1480         if (sk->sk_stamp.tv_sec == 0)
1481                 do_gettimeofday(&sk->sk_stamp);
1482         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1483                 -EFAULT : 0;
1484 }
1485 EXPORT_SYMBOL(sock_get_timestamp);
1486
1487 void sock_enable_timestamp(struct sock *sk)
1488 {
1489         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1490                 sock_set_flag(sk, SOCK_TIMESTAMP);
1491                 net_enable_timestamp();
1492         }
1493 }
1494 EXPORT_SYMBOL(sock_enable_timestamp);
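/*
 * Typical caller (modelled on the address families' SIOCGSTAMP ioctl
 * handling): user space asks for the receive time of the last packet, and
 * the first query transparently switches timestamping on.
 *
 *	case SIOCGSTAMP:
 *		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
 *		break;
 */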
1495
1496 /*
1497  *      Get a socket option on a socket.
1498  *
1499  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1500  *      asynchronous errors should be reported by getsockopt. We assume
1501  *      this means if you specify SO_ERROR (otherwise what's the point of it).
1502  */
1503 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1504                            char __user *optval, int __user *optlen)
1505 {
1506         struct sock *sk = sock->sk;
1507
1508         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1509 }
1510
1511 EXPORT_SYMBOL(sock_common_getsockopt);
1512
1513 #ifdef CONFIG_COMPAT
1514 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1515                                   char __user *optval, int __user *optlen)
1516 {
1517         struct sock *sk = sock->sk;
1518
1519         if (sk->sk_prot->compat_getsockopt != NULL)
1520                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1521                                                       optval, optlen);
1522         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1523 }
1524 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1525 #endif
1526
1527 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1528                         struct msghdr *msg, size_t size, int flags)
1529 {
1530         struct sock *sk = sock->sk;
1531         int addr_len = 0;
1532         int err;
1533
1534         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1535                                    flags & ~MSG_DONTWAIT, &addr_len);
1536         if (err >= 0)
1537                 msg->msg_namelen = addr_len;
1538         return err;
1539 }
1540
1541 EXPORT_SYMBOL(sock_common_recvmsg);
1542
1543 /*
1544  *      Set socket options on an inet socket.
1545  */
1546 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1547                            char __user *optval, int optlen)
1548 {
1549         struct sock *sk = sock->sk;
1550
1551         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1552 }
1553
1554 EXPORT_SYMBOL(sock_common_setsockopt);
1555
1556 #ifdef CONFIG_COMPAT
1557 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1558                                   char __user *optval, int optlen)
1559 {
1560         struct sock *sk = sock->sk;
1561
1562         if (sk->sk_prot->compat_setsockopt != NULL)
1563                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1564                                                       optval, optlen);
1565         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1566 }
1567 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1568 #endif
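/*
 * Illustrative wiring (modelled on the inet proto_ops tables; myproto_ops
 * is hypothetical): families that keep their real logic in struct proto
 * point the generic socket-layer callbacks at these common helpers.
 *
 *	static const struct proto_ops myproto_ops = {
 *		...
 *		.setsockopt = sock_common_setsockopt,
 *		.getsockopt = sock_common_getsockopt,
 *		.recvmsg    = sock_common_recvmsg,
 *		...
 *	};
 */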
1569
1570 void sk_common_release(struct sock *sk)
1571 {
1572         if (sk->sk_prot->destroy)
1573                 sk->sk_prot->destroy(sk);
1574
1575         /*
1576          * Observation: when sk_common_release is called, processes have
1577          * no access to the socket, but the network stack still does.
1578          * Step one, detach it from networking:
1579          *
1580          * A. Remove from hash tables.
1581          */
1582
1583         sk->sk_prot->unhash(sk);
1584
1585         /*
1586          * At this point the socket cannot receive new packets, but some
1587          * packets may still be in flight: another CPU may have done the
1588          * hash table lookup before we unhashed the socket. Those packets
1589          * will reach the receive queue and be purged by the destructor.
1590          *
1591          * Also, we still have packets pending on the receive queue and
1592          * probably our own packets waiting in device queues. sock_destroy
1593          * will drain the receive queue, but transmitted packets will delay
1594          * socket destruction until the last reference is released.
1595          */
1596
1597         sock_orphan(sk);
1598
1599         xfrm_sk_free_policy(sk);
1600
1601         sk_refcnt_debug_release(sk);
1602         sock_put(sk);
1603 }
1604
1605 EXPORT_SYMBOL(sk_common_release);
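/*
 * Typical caller (modelled on simple datagram protocols such as raw
 * sockets, whose close routine is little more than this; myproto_close is
 * hypothetical): the protocol's close path funnels into sk_common_release().
 *
 *	static void myproto_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */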
1606
1607 static DEFINE_RWLOCK(proto_list_lock);
1608 static LIST_HEAD(proto_list);
1609
1610 int proto_register(struct proto *prot, int alloc_slab)
1611 {
1612         char *request_sock_slab_name = NULL;
1613         char *timewait_sock_slab_name = NULL;
1614         int rc = -ENOBUFS;
1615
1616         if (alloc_slab) {
1617                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1618                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1619
1620                 if (prot->slab == NULL) {
1621                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1622                                prot->name);
1623                         goto out;
1624                 }
1625
1626                 if (prot->rsk_prot != NULL) {
1627                         static const char mask[] = "request_sock_%s";
1628
1629                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1630                         if (request_sock_slab_name == NULL)
1631                                 goto out_free_sock_slab;
1632
1633                         sprintf(request_sock_slab_name, mask, prot->name);
1634                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1635                                                                  prot->rsk_prot->obj_size, 0,
1636                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1637
1638                         if (prot->rsk_prot->slab == NULL) {
1639                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1640                                        prot->name);
1641                                 goto out_free_request_sock_slab_name;
1642                         }
1643                 }
1644
1645                 if (prot->twsk_prot != NULL) {
1646                         static const char mask[] = "tw_sock_%s";
1647
1648                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1649
1650                         if (timewait_sock_slab_name == NULL)
1651                                 goto out_free_request_sock_slab;
1652
1653                         sprintf(timewait_sock_slab_name, mask, prot->name);
1654                         prot->twsk_prot->twsk_slab =
1655                                 kmem_cache_create(timewait_sock_slab_name,
1656                                                   prot->twsk_prot->twsk_obj_size,
1657                                                   0, SLAB_HWCACHE_ALIGN,
1658                                                   NULL, NULL);
1659                         if (prot->twsk_prot->twsk_slab == NULL)
1660                                 goto out_free_timewait_sock_slab_name;
1661                 }
1662         }
1663
1664         write_lock(&proto_list_lock);
1665         list_add(&prot->node, &proto_list);
1666         write_unlock(&proto_list_lock);
1667         rc = 0;
1668 out:
1669         return rc;
1670 out_free_timewait_sock_slab_name:
1671         kfree(timewait_sock_slab_name);
1672 out_free_request_sock_slab:
1673         if (prot->rsk_prot && prot->rsk_prot->slab) {
1674                 kmem_cache_destroy(prot->rsk_prot->slab);
1675                 prot->rsk_prot->slab = NULL;
1676         }
1677 out_free_request_sock_slab_name:
1678         kfree(request_sock_slab_name);
1679 out_free_sock_slab:
1680         kmem_cache_destroy(prot->slab);
1681         prot->slab = NULL;
1682         goto out;
1683 }
1684
1685 EXPORT_SYMBOL(proto_register);
1686
1687 void proto_unregister(struct proto *prot)
1688 {
1689         write_lock(&proto_list_lock);
1690         list_del(&prot->node);
1691         write_unlock(&proto_list_lock);
1692
1693         if (prot->slab != NULL) {
1694                 kmem_cache_destroy(prot->slab);
1695                 prot->slab = NULL;
1696         }
1697
1698         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1699                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1700
1701                 kmem_cache_destroy(prot->rsk_prot->slab);
1702                 kfree(name);
1703                 prot->rsk_prot->slab = NULL;
1704         }
1705
1706         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1707                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1708
1709                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1710                 kfree(name);
1711                 prot->twsk_prot->twsk_slab = NULL;
1712         }
1713 }
1714
1715 EXPORT_SYMBOL(proto_unregister);
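/*
 * Usage sketch (hypothetical myproto_* names): a protocol registers its
 * struct proto once at module init, passing alloc_slab=1 when its socks
 * should come from a dedicated kmem cache sized by obj_size, and
 * unregisters symmetrically on exit.
 *
 *	static struct proto myproto_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct myproto_sock),
 *	};
 *
 *	static int __init myproto_init(void)
 *	{
 *		return proto_register(&myproto_proto, 1);
 *	}
 *
 *	static void __exit myproto_exit(void)
 *	{
 *		proto_unregister(&myproto_proto);
 *	}
 */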
1716
1717 #ifdef CONFIG_PROC_FS
1718 static inline struct proto *__proto_head(void)
1719 {
1720         return list_entry(proto_list.next, struct proto, node);
1721 }
1722
1723 static inline struct proto *proto_head(void)
1724 {
1725         return list_empty(&proto_list) ? NULL : __proto_head();
1726 }
1727
1728 static inline struct proto *proto_next(struct proto *proto)
1729 {
1730         return proto->node.next == &proto_list ? NULL :
1731                 list_entry(proto->node.next, struct proto, node);
1732 }
1733
1734 static inline struct proto *proto_get_idx(loff_t pos)
1735 {
1736         struct proto *proto;
1737         loff_t i = 0;
1738
1739         list_for_each_entry(proto, &proto_list, node)
1740                 if (i++ == pos)
1741                         goto out;
1742
1743         proto = NULL;
1744 out:
1745         return proto;
1746 }
1747
1748 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1749 {
1750         read_lock(&proto_list_lock);
1751         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1752 }
1753
1754 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1755 {
1756         ++*pos;
1757         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1758 }
1759
1760 static void proto_seq_stop(struct seq_file *seq, void *v)
1761 {
1762         read_unlock(&proto_list_lock);
1763 }
1764
1765 static char proto_method_implemented(const void *method)
1766 {
1767         return method == NULL ? 'n' : 'y';
1768 }
1769
1770 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1771 {
1772         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1773                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1774                    proto->name,
1775                    proto->obj_size,
1776                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1777                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1778                    proto->memory_pressure != NULL ? (*proto->memory_pressure ? "yes" : "no") : "NI",
1779                    proto->max_header,
1780                    proto->slab == NULL ? "no" : "yes",
1781                    module_name(proto->owner),
1782                    proto_method_implemented(proto->close),
1783                    proto_method_implemented(proto->connect),
1784                    proto_method_implemented(proto->disconnect),
1785                    proto_method_implemented(proto->accept),
1786                    proto_method_implemented(proto->ioctl),
1787                    proto_method_implemented(proto->init),
1788                    proto_method_implemented(proto->destroy),
1789                    proto_method_implemented(proto->shutdown),
1790                    proto_method_implemented(proto->setsockopt),
1791                    proto_method_implemented(proto->getsockopt),
1792                    proto_method_implemented(proto->sendmsg),
1793                    proto_method_implemented(proto->recvmsg),
1794                    proto_method_implemented(proto->sendpage),
1795                    proto_method_implemented(proto->bind),
1796                    proto_method_implemented(proto->backlog_rcv),
1797                    proto_method_implemented(proto->hash),
1798                    proto_method_implemented(proto->unhash),
1799                    proto_method_implemented(proto->get_port),
1800                    proto_method_implemented(proto->enter_memory_pressure));
1801 }
1802
1803 static int proto_seq_show(struct seq_file *seq, void *v)
1804 {
1805         if (v == SEQ_START_TOKEN)
1806                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1807                            "protocol",
1808                            "size",
1809                            "sockets",
1810                            "memory",
1811                            "press",
1812                            "maxhdr",
1813                            "slab",
1814                            "module",
1815                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1816         else
1817                 proto_seq_printf(seq, v);
1818         return 0;
1819 }
1820
1821 static struct seq_operations proto_seq_ops = {
1822         .start  = proto_seq_start,
1823         .next   = proto_seq_next,
1824         .stop   = proto_seq_stop,
1825         .show   = proto_seq_show,
1826 };
1827
1828 static int proto_seq_open(struct inode *inode, struct file *file)
1829 {
1830         return seq_open(file, &proto_seq_ops);
1831 }
1832
1833 static struct file_operations proto_seq_fops = {
1834         .owner          = THIS_MODULE,
1835         .open           = proto_seq_open,
1836         .read           = seq_read,
1837         .llseek         = seq_lseek,
1838         .release        = seq_release,
1839 };
1840
1841 static int __init proto_init(void)
1842 {
1843         /* register /proc/net/protocols */
1844         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1845 }
1846
1847 subsys_initcall(proto_init);
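/*
 * The resulting /proc/net/protocols shows one row per registered proto;
 * the trailing y/n columns mirror proto_method_implemented() above.
 * Shape of the output (values are illustrative, not real):
 *
 *	$ cat /proc/net/protocols
 *	protocol  size sockets  memory press maxhdr  slab module  cl co ...
 *	TCP       ...
 *	UDP       ...
 */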
1848
1849 #endif /* PROC_FS */
1850
1851 EXPORT_SYMBOL(sk_alloc);
1852 EXPORT_SYMBOL(sk_free);
1853 EXPORT_SYMBOL(sk_send_sigurg);
1854 EXPORT_SYMBOL(sock_alloc_send_skb);
1855 EXPORT_SYMBOL(sock_init_data);
1856 EXPORT_SYMBOL(sock_kfree_s);
1857 EXPORT_SYMBOL(sock_kmalloc);
1858 EXPORT_SYMBOL(sock_no_accept);
1859 EXPORT_SYMBOL(sock_no_bind);
1860 EXPORT_SYMBOL(sock_no_connect);
1861 EXPORT_SYMBOL(sock_no_getname);
1862 EXPORT_SYMBOL(sock_no_getsockopt);
1863 EXPORT_SYMBOL(sock_no_ioctl);
1864 EXPORT_SYMBOL(sock_no_listen);
1865 EXPORT_SYMBOL(sock_no_mmap);
1866 EXPORT_SYMBOL(sock_no_poll);
1867 EXPORT_SYMBOL(sock_no_recvmsg);
1868 EXPORT_SYMBOL(sock_no_sendmsg);
1869 EXPORT_SYMBOL(sock_no_sendpage);
1870 EXPORT_SYMBOL(sock_no_setsockopt);
1871 EXPORT_SYMBOL(sock_no_shutdown);
1872 EXPORT_SYMBOL(sock_no_socketpair);
1873 EXPORT_SYMBOL(sock_rfree);
1874 EXPORT_SYMBOL(sock_setsockopt);
1875 EXPORT_SYMBOL(sock_wfree);
1876 EXPORT_SYMBOL(sock_wmalloc);
1877 EXPORT_SYMBOL(sock_i_uid);
1878 EXPORT_SYMBOL(sock_i_ino);
1879 EXPORT_SYMBOL(sysctl_optmem_max);
1880 #ifdef CONFIG_SYSCTL
1881 EXPORT_SYMBOL(sysctl_rmem_max);
1882 EXPORT_SYMBOL(sysctl_wmem_max);
1883 #endif