1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *              Alan Cox        :       Numerous verify_area() problems
17  *              Alan Cox        :       Connecting on a connecting socket
18  *                                      now returns an error for tcp.
19  *              Alan Cox        :       sock->protocol is set correctly.
20  *                                      and is not sometimes left as 0.
21  *              Alan Cox        :       connect handles icmp errors on a
22  *                                      connect properly. Unfortunately there
23  *                                      is a restart syscall nasty there. I
24  *                                      can't match BSD without hacking the C
25  *                                      library. Ideas urgently sought!
26  *              Alan Cox        :       Disallow bind() to addresses that are
27  *                                      not ours - especially broadcast ones!!
28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
30  *                                      instead they leave that for the DESTROY timer.
31  *              Alan Cox        :       Clean up error flag in accept
32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
33  *                                      was buggy. Put a remove_sock() in the handler
34  *                                      for memory when we hit 0. Also altered the timer
35  *                                      code. The ACK stuff can wait and needs major
36  *                                      TCP layer surgery.
37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
38  *                                      and fixed timer/inet_bh race.
39  *              Alan Cox        :       Added zapped flag for TCP
40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
47  *      Pauline Middelink       :       identd support
48  *              Alan Cox        :       Fixed connect() taking signals I think.
49  *              Alan Cox        :       SO_LINGER supported
50  *              Alan Cox        :       Error reporting fixes
51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
52  *              Alan Cox        :       inet sockets don't set sk->type!
53  *              Alan Cox        :       Split socket option code
54  *              Alan Cox        :       Callbacks
55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
56  *              Alex            :       Removed restriction on inet fioctl
57  *              Alan Cox        :       Splitting INET from NET core
58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
60  *              Alan Cox        :       Split IP from generic code
61  *              Alan Cox        :       New kfree_skbmem()
62  *              Alan Cox        :       Make SO_DEBUG superuser only.
63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
64  *                                      (compatibility fix)
65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
66  *              Alan Cox        :       Allocator for a socket is settable.
67  *              Alan Cox        :       SO_ERROR includes soft errors.
68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
69  *              Alan Cox        :       Generic socket allocation to make hooks
70  *                                      easier (suggested by Craig Metz).
71  *              Michael Pall    :       SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
79  *              Andi Kleen      :       Fix write_space callback
80  *              Chris Evans     :       Security fixes - signedness again
81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *              This program is free software; you can redistribute it and/or
87  *              modify it under the terms of the GNU General Public License
88  *              as published by the Free Software Foundation; either version
89  *              2 of the License, or (at your option) any later version.
90  */
91
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
133 /*
134  * Each address family might have different locking rules, so we have
135  * one slock key per address family:
136  */
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140 /*
141  * Make lock validator output more readable. (We pre-construct these
142  * strings at build time, so that runtime initialization of socket
143  * locks is fast.)
144  */
145 static const char *af_family_key_strings[AF_MAX+1] = {
146   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158   "sk_lock-AF_IEEE802154",
159   "sk_lock-AF_MAX"
160 };
161 static const char *af_family_slock_key_strings[AF_MAX+1] = {
162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174   "slock-AF_IEEE802154",
175   "slock-AF_MAX"
176 };
177 static const char *af_family_clock_key_strings[AF_MAX+1] = {
178   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190   "clock-AF_IEEE802154",
191   "clock-AF_MAX"
192 };
193
194 /*
195  * sk_callback_lock locking rules are per-address-family,
196  * so split the lock classes by using a per-AF key:
197  */
198 static struct lock_class_key af_callback_keys[AF_MAX];
199
200 /* Take into consideration the size of the struct sk_buff overhead in the
201  * determination of these values, since that is non-constant across
202  * platforms.  This makes socket queueing behavior and performance
203  * not depend upon such differences.
204  */
205 #define _SK_MEM_PACKETS         256
206 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
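/*
 * Rough worked example (figures are illustrative, not exact): if
 * sizeof(struct sk_buff) is about 256 bytes, _SK_MEM_OVERHEAD comes to
 * roughly 512 bytes and SK_WMEM_MAX/SK_RMEM_MAX default to about
 * 256 * 512 = 128 KiB per socket.  The exact value varies with the
 * platform's sk_buff size, which is why the overhead is factored in
 * here rather than hard-coded.
 */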
209
210 /* Run time adjustable parameters. */
211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216 /* Maximal space eaten by iovec or ancillary data plus some space */
217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218 EXPORT_SYMBOL(sysctl_optmem_max);
219
220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221 {
222         struct timeval tv;
223
224         if (optlen < sizeof(tv))
225                 return -EINVAL;
226         if (copy_from_user(&tv, optval, sizeof(tv)))
227                 return -EFAULT;
228         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229                 return -EDOM;
230
231         if (tv.tv_sec < 0) {
232                 static int warned __read_mostly;
233
234                 *timeo_p = 0;
235                 if (warned < 10 && net_ratelimit()) {
236                         warned++;
237                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238                                "tries to set negative timeout\n",
239                                 current->comm, task_pid_nr(current));
240                 }
241                 return 0;
242         }
243         *timeo_p = MAX_SCHEDULE_TIMEOUT;
244         if (tv.tv_sec == 0 && tv.tv_usec == 0)
245                 return 0;
246         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248         return 0;
249 }
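/*
 * Worked example of the conversion above (HZ == 100 is just an
 * assumption; it is configuration dependent): a request of
 * { tv_sec = 2, tv_usec = 500000 } becomes
 * 2 * 100 + (500000 + 9999) / 10000 = 250 jiffies, while an all-zero
 * timeval maps to MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 */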
250
251 static void sock_warn_obsolete_bsdism(const char *name)
252 {
253         static int warned;
254         static char warncomm[TASK_COMM_LEN];
255         if (strcmp(warncomm, current->comm) && warned < 5) {
256                 strcpy(warncomm,  current->comm);
257                 printk(KERN_WARNING "process `%s' is using obsolete "
258                        "%s SO_BSDCOMPAT\n", warncomm, name);
259                 warned++;
260         }
261 }
262
263 static void sock_disable_timestamp(struct sock *sk, int flag)
264 {
265         if (sock_flag(sk, flag)) {
266                 sock_reset_flag(sk, flag);
267                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269                         net_disable_timestamp();
270                 }
271         }
272 }
273
274
275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276 {
277         int err = 0;
278         int skb_len;
279
280         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
281            number of warnings when compiling with -W --ANK
282          */
283         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
284             (unsigned)sk->sk_rcvbuf) {
285                 err = -ENOMEM;
286                 goto out;
287         }
288
289         err = sk_filter(sk, skb);
290         if (err)
291                 goto out;
292
293         if (!sk_rmem_schedule(sk, skb->truesize)) {
294                 err = -ENOBUFS;
295                 goto out;
296         }
297
298         skb->dev = NULL;
299         skb_set_owner_r(skb, sk);
300
301         /* Cache the SKB length before we tack it onto the receive
302          * queue.  Once it is added it no longer belongs to us and
303          * may be freed by other threads of control pulling packets
304          * from the queue.
305          */
306         skb_len = skb->len;
307
308         skb_queue_tail(&sk->sk_receive_queue, skb);
309
310         if (!sock_flag(sk, SOCK_DEAD))
311                 sk->sk_data_ready(sk, skb_len);
312 out:
313         return err;
314 }
315 EXPORT_SYMBOL(sock_queue_rcv_skb);
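/*
 * Hypothetical caller sketch (not taken from a real protocol handler):
 * a datagram protocol's receive routine would typically do something
 * like
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 *
 * i.e. on -ENOMEM/-ENOBUFS the skb still belongs to the caller and
 * must be freed (or otherwise accounted) by it.
 */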
316
317 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
318 {
319         int rc = NET_RX_SUCCESS;
320
321         if (sk_filter(sk, skb))
322                 goto discard_and_relse;
323
324         skb->dev = NULL;
325
326         if (nested)
327                 bh_lock_sock_nested(sk);
328         else
329                 bh_lock_sock(sk);
330         if (!sock_owned_by_user(sk)) {
331                 /*
332                  * trylock + unlock semantics:
333                  */
334                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
335
336                 rc = sk_backlog_rcv(sk, skb);
337
338                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
339         } else
340                 sk_add_backlog(sk, skb);
341         bh_unlock_sock(sk);
342 out:
343         sock_put(sk);
344         return rc;
345 discard_and_relse:
346         kfree_skb(skb);
347         goto out;
348 }
349 EXPORT_SYMBOL(sk_receive_skb);
350
351 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
352 {
353         struct dst_entry *dst = sk->sk_dst_cache;
354
355         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356                 sk->sk_dst_cache = NULL;
357                 dst_release(dst);
358                 return NULL;
359         }
360
361         return dst;
362 }
363 EXPORT_SYMBOL(__sk_dst_check);
364
365 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
366 {
367         struct dst_entry *dst = sk_dst_get(sk);
368
369         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370                 sk_dst_reset(sk);
371                 dst_release(dst);
372                 return NULL;
373         }
374
375         return dst;
376 }
377 EXPORT_SYMBOL(sk_dst_check);
378
379 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
380 {
381         int ret = -ENOPROTOOPT;
382 #ifdef CONFIG_NETDEVICES
383         struct net *net = sock_net(sk);
384         char devname[IFNAMSIZ];
385         int index;
386
387         /* Sorry... */
388         ret = -EPERM;
389         if (!capable(CAP_NET_RAW))
390                 goto out;
391
392         ret = -EINVAL;
393         if (optlen < 0)
394                 goto out;
395
396         /* Bind this socket to a particular device like "eth0",
397          * as specified in the passed interface name. If the
398          * name is "" or the option length is zero the socket
399          * is not bound.
400          */
401         if (optlen > IFNAMSIZ - 1)
402                 optlen = IFNAMSIZ - 1;
403         memset(devname, 0, sizeof(devname));
404
405         ret = -EFAULT;
406         if (copy_from_user(devname, optval, optlen))
407                 goto out;
408
409         if (devname[0] == '\0') {
410                 index = 0;
411         } else {
412                 struct net_device *dev = dev_get_by_name(net, devname);
413
414                 ret = -ENODEV;
415                 if (!dev)
416                         goto out;
417
418                 index = dev->ifindex;
419                 dev_put(dev);
420         }
421
422         lock_sock(sk);
423         sk->sk_bound_dev_if = index;
424         sk_dst_reset(sk);
425         release_sock(sk);
426
427         ret = 0;
428
429 out:
430 #endif
431
432         return ret;
433 }
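/*
 * Userspace view of the above (illustrative example, not part of this
 * file's API): SO_BINDTODEVICE takes the interface name as its value,
 * e.g.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * while an empty name (or a zero option length) clears the binding
 * again.  CAP_NET_RAW is required in both cases.
 */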
434
435 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
436 {
437         if (valbool)
438                 sock_set_flag(sk, bit);
439         else
440                 sock_reset_flag(sk, bit);
441 }
442
443 /*
444  *      This is meant for all protocols to use and covers goings on
445  *      at the socket level. Everything here is generic.
446  */
447
448 int sock_setsockopt(struct socket *sock, int level, int optname,
449                     char __user *optval, int optlen)
450 {
451         struct sock *sk = sock->sk;
452         int val;
453         int valbool;
454         struct linger ling;
455         int ret = 0;
456
457         /*
458          *      Options without arguments
459          */
460
461         if (optname == SO_BINDTODEVICE)
462                 return sock_bindtodevice(sk, optval, optlen);
463
464         if (optlen < sizeof(int))
465                 return -EINVAL;
466
467         if (get_user(val, (int __user *)optval))
468                 return -EFAULT;
469
470         valbool = val ? 1 : 0;
471
472         lock_sock(sk);
473
474         switch (optname) {
475         case SO_DEBUG:
476                 if (val && !capable(CAP_NET_ADMIN))
477                         ret = -EACCES;
478                 else
479                         sock_valbool_flag(sk, SOCK_DBG, valbool);
480                 break;
481         case SO_REUSEADDR:
482                 sk->sk_reuse = valbool;
483                 break;
484         case SO_TYPE:
485         case SO_ERROR:
486                 ret = -ENOPROTOOPT;
487                 break;
488         case SO_DONTROUTE:
489                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
490                 break;
491         case SO_BROADCAST:
492                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
493                 break;
494         case SO_SNDBUF:
495                 /* Don't error on this; BSD doesn't, and if you think
496                    about it, this is right. Otherwise apps have to
497                    play 'guess the biggest size' games. RCVBUF/SNDBUF
498                    are treated in BSD as hints */
499
500                 if (val > sysctl_wmem_max)
501                         val = sysctl_wmem_max;
502 set_sndbuf:
503                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
504                 if ((val * 2) < SOCK_MIN_SNDBUF)
505                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
506                 else
507                         sk->sk_sndbuf = val * 2;
508
509                 /*
510                  *      Wake up sending tasks if we
511                  *      upped the value.
512                  */
513                 sk->sk_write_space(sk);
514                 break;
515
516         case SO_SNDBUFFORCE:
517                 if (!capable(CAP_NET_ADMIN)) {
518                         ret = -EPERM;
519                         break;
520                 }
521                 goto set_sndbuf;
522
523         case SO_RCVBUF:
524                 /* Don't error on this; BSD doesn't, and if you think
525                    about it, this is right. Otherwise apps have to
526                    play 'guess the biggest size' games. RCVBUF/SNDBUF
527                    are treated in BSD as hints */
528
529                 if (val > sysctl_rmem_max)
530                         val = sysctl_rmem_max;
531 set_rcvbuf:
532                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
533                 /*
534                  * We double it on the way in to account for
535                  * "struct sk_buff" etc. overhead.   Applications
536                  * assume that the SO_RCVBUF setting they make will
537                  * allow that much actual data to be received on that
538                  * socket.
539                  *
540                  * Applications are unaware that "struct sk_buff" and
541                  * other overheads allocate from the receive buffer
542                  * during socket buffer allocation.
543                  *
544                  * And after considering the possible alternatives,
545                  * returning the value we actually used in getsockopt
546                  * is the most desirable behavior.
547                  */
548                 if ((val * 2) < SOCK_MIN_RCVBUF)
549                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
550                 else
551                         sk->sk_rcvbuf = val * 2;
552                 break;
553
554         case SO_RCVBUFFORCE:
555                 if (!capable(CAP_NET_ADMIN)) {
556                         ret = -EPERM;
557                         break;
558                 }
559                 goto set_rcvbuf;
560
561         case SO_KEEPALIVE:
562 #ifdef CONFIG_INET
563                 if (sk->sk_protocol == IPPROTO_TCP)
564                         tcp_set_keepalive(sk, valbool);
565 #endif
566                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
567                 break;
568
569         case SO_OOBINLINE:
570                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
571                 break;
572
573         case SO_NO_CHECK:
574                 sk->sk_no_check = valbool;
575                 break;
576
577         case SO_PRIORITY:
578                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
579                         sk->sk_priority = val;
580                 else
581                         ret = -EPERM;
582                 break;
583
584         case SO_LINGER:
585                 if (optlen < sizeof(ling)) {
586                         ret = -EINVAL;  /* 1003.1g */
587                         break;
588                 }
589                 if (copy_from_user(&ling, optval, sizeof(ling))) {
590                         ret = -EFAULT;
591                         break;
592                 }
593                 if (!ling.l_onoff)
594                         sock_reset_flag(sk, SOCK_LINGER);
595                 else {
596 #if (BITS_PER_LONG == 32)
597                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
598                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
599                         else
600 #endif
601                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
602                         sock_set_flag(sk, SOCK_LINGER);
603                 }
604                 break;
605
606         case SO_BSDCOMPAT:
607                 sock_warn_obsolete_bsdism("setsockopt");
608                 break;
609
610         case SO_PASSCRED:
611                 if (valbool)
612                         set_bit(SOCK_PASSCRED, &sock->flags);
613                 else
614                         clear_bit(SOCK_PASSCRED, &sock->flags);
615                 break;
616
617         case SO_TIMESTAMP:
618         case SO_TIMESTAMPNS:
619                 if (valbool)  {
620                         if (optname == SO_TIMESTAMP)
621                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
622                         else
623                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
624                         sock_set_flag(sk, SOCK_RCVTSTAMP);
625                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
626                 } else {
627                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
628                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
629                 }
630                 break;
631
632         case SO_TIMESTAMPING:
633                 if (val & ~SOF_TIMESTAMPING_MASK) {
634                         ret = -EINVAL;
635                         break;
636                 }
637                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
638                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
639                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
640                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
641                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
642                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
643                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
644                         sock_enable_timestamp(sk,
645                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
646                 else
647                         sock_disable_timestamp(sk,
648                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
649                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
650                                   val & SOF_TIMESTAMPING_SOFTWARE);
651                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
652                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
653                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
654                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
655                 break;
656
657         case SO_RCVLOWAT:
658                 if (val < 0)
659                         val = INT_MAX;
660                 sk->sk_rcvlowat = val ? : 1;
661                 break;
662
663         case SO_RCVTIMEO:
664                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
665                 break;
666
667         case SO_SNDTIMEO:
668                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
669                 break;
670
671         case SO_ATTACH_FILTER:
672                 ret = -EINVAL;
673                 if (optlen == sizeof(struct sock_fprog)) {
674                         struct sock_fprog fprog;
675
676                         ret = -EFAULT;
677                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
678                                 break;
679
680                         ret = sk_attach_filter(&fprog, sk);
681                 }
682                 break;
683
684         case SO_DETACH_FILTER:
685                 ret = sk_detach_filter(sk);
686                 break;
687
688         case SO_PASSSEC:
689                 if (valbool)
690                         set_bit(SOCK_PASSSEC, &sock->flags);
691                 else
692                         clear_bit(SOCK_PASSSEC, &sock->flags);
693                 break;
694         case SO_MARK:
695                 if (!capable(CAP_NET_ADMIN))
696                         ret = -EPERM;
697                 else
698                         sk->sk_mark = val;
699                 break;
700
701                 /* We implement SO_SNDLOWAT etc. as not settable
702                    (1003.1g 5.3) */
703         default:
704                 ret = -ENOPROTOOPT;
705                 break;
706         }
707         release_sock(sk);
708         return ret;
709 }
710 EXPORT_SYMBOL(sock_setsockopt);
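/*
 * Illustrative consequence of the doubling above (assuming the request
 * does not exceed sysctl_rmem_max): after
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * a subsequent getsockopt(SO_RCVBUF) reports 131072, because the value
 * is doubled on the way in to cover struct sk_buff and other overhead.
 */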
711
712
713 int sock_getsockopt(struct socket *sock, int level, int optname,
714                     char __user *optval, int __user *optlen)
715 {
716         struct sock *sk = sock->sk;
717
718         union {
719                 int val;
720                 struct linger ling;
721                 struct timeval tm;
722         } v;
723
724         unsigned int lv = sizeof(int);
725         int len;
726
727         if (get_user(len, optlen))
728                 return -EFAULT;
729         if (len < 0)
730                 return -EINVAL;
731
732         memset(&v, 0, sizeof(v));
733
734         switch (optname) {
735         case SO_DEBUG:
736                 v.val = sock_flag(sk, SOCK_DBG);
737                 break;
738
739         case SO_DONTROUTE:
740                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
741                 break;
742
743         case SO_BROADCAST:
744                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
745                 break;
746
747         case SO_SNDBUF:
748                 v.val = sk->sk_sndbuf;
749                 break;
750
751         case SO_RCVBUF:
752                 v.val = sk->sk_rcvbuf;
753                 break;
754
755         case SO_REUSEADDR:
756                 v.val = sk->sk_reuse;
757                 break;
758
759         case SO_KEEPALIVE:
760                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
761                 break;
762
763         case SO_TYPE:
764                 v.val = sk->sk_type;
765                 break;
766
767         case SO_ERROR:
768                 v.val = -sock_error(sk);
769                 if (v.val == 0)
770                         v.val = xchg(&sk->sk_err_soft, 0);
771                 break;
772
773         case SO_OOBINLINE:
774                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
775                 break;
776
777         case SO_NO_CHECK:
778                 v.val = sk->sk_no_check;
779                 break;
780
781         case SO_PRIORITY:
782                 v.val = sk->sk_priority;
783                 break;
784
785         case SO_LINGER:
786                 lv              = sizeof(v.ling);
787                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
788                 v.ling.l_linger = sk->sk_lingertime / HZ;
789                 break;
790
791         case SO_BSDCOMPAT:
792                 sock_warn_obsolete_bsdism("getsockopt");
793                 break;
794
795         case SO_TIMESTAMP:
796                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
797                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
798                 break;
799
800         case SO_TIMESTAMPNS:
801                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
802                 break;
803
804         case SO_TIMESTAMPING:
805                 v.val = 0;
806                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
807                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
808                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
809                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
810                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
811                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
812                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
813                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
814                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
815                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
816                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
817                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
818                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
819                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
820                 break;
821
822         case SO_RCVTIMEO:
823                 lv = sizeof(struct timeval);
824                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
825                         v.tm.tv_sec = 0;
826                         v.tm.tv_usec = 0;
827                 } else {
828                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
829                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
830                 }
831                 break;
832
833         case SO_SNDTIMEO:
834                 lv = sizeof(struct timeval);
835                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
836                         v.tm.tv_sec = 0;
837                         v.tm.tv_usec = 0;
838                 } else {
839                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
840                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
841                 }
842                 break;
843
844         case SO_RCVLOWAT:
845                 v.val = sk->sk_rcvlowat;
846                 break;
847
848         case SO_SNDLOWAT:
849                 v.val = 1;
850                 break;
851
852         case SO_PASSCRED:
853                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
854                 break;
855
856         case SO_PEERCRED:
857                 if (len > sizeof(sk->sk_peercred))
858                         len = sizeof(sk->sk_peercred);
859                 if (copy_to_user(optval, &sk->sk_peercred, len))
860                         return -EFAULT;
861                 goto lenout;
862
863         case SO_PEERNAME:
864         {
865                 char address[128];
866
867                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
868                         return -ENOTCONN;
869                 if (lv < len)
870                         return -EINVAL;
871                 if (copy_to_user(optval, address, len))
872                         return -EFAULT;
873                 goto lenout;
874         }
875
876         /* Dubious BSD thing... Probably nobody even uses it, but
877          * the UNIX standard wants it for whatever reason... -DaveM
878          */
879         case SO_ACCEPTCONN:
880                 v.val = sk->sk_state == TCP_LISTEN;
881                 break;
882
883         case SO_PASSSEC:
884                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
885                 break;
886
887         case SO_PEERSEC:
888                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
889
890         case SO_MARK:
891                 v.val = sk->sk_mark;
892                 break;
893
894         default:
895                 return -ENOPROTOOPT;
896         }
897
898         if (len > lv)
899                 len = lv;
900         if (copy_to_user(optval, &v, len))
901                 return -EFAULT;
902 lenout:
903         if (put_user(len, optlen))
904                 return -EFAULT;
905         return 0;
906 }
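/*
 * Note on SO_ERROR above (illustrative): sock_error() fetches and
 * clears sk->sk_err atomically, so
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * returns a pending error exactly once; a second call reports 0 unless
 * a soft error is queued in sk_err_soft.
 */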
907
908 /*
909  * Initialize an sk_lock.
910  *
911  * (We also register the sk_lock with the lock validator.)
912  */
913 static inline void sock_lock_init(struct sock *sk)
914 {
915         sock_lock_init_class_and_name(sk,
916                         af_family_slock_key_strings[sk->sk_family],
917                         af_family_slock_keys + sk->sk_family,
918                         af_family_key_strings[sk->sk_family],
919                         af_family_keys + sk->sk_family);
920 }
921
922 /*
923  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
924  * even temporarily, because of RCU lookups. sk_node should also be left as is.
925  */
926 static void sock_copy(struct sock *nsk, const struct sock *osk)
927 {
928 #ifdef CONFIG_SECURITY_NETWORK
929         void *sptr = nsk->sk_security;
930 #endif
931         BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
932                      sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
933         memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
934                osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
935 #ifdef CONFIG_SECURITY_NETWORK
936         nsk->sk_security = sptr;
937         security_sk_clone(osk, nsk);
938 #endif
939 }
940
941 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
942                 int family)
943 {
944         struct sock *sk;
945         struct kmem_cache *slab;
946
947         slab = prot->slab;
948         if (slab != NULL) {
949                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
950                 if (!sk)
951                         return sk;
952                 if (priority & __GFP_ZERO) {
953                         /*
954                          * caches using SLAB_DESTROY_BY_RCU should leave
955                          * sk_node.next unmodified. Special care is taken
956                          * when initializing the object to zero.
957                          */
958                         if (offsetof(struct sock, sk_node.next) != 0)
959                                 memset(sk, 0, offsetof(struct sock, sk_node.next));
960                         memset(&sk->sk_node.pprev, 0,
961                                prot->obj_size - offsetof(struct sock,
962                                                          sk_node.pprev));
963                 }
964         }
965         else
966                 sk = kmalloc(prot->obj_size, priority);
967
968         if (sk != NULL) {
969                 kmemcheck_annotate_bitfield(sk, flags);
970
971                 if (security_sk_alloc(sk, family, priority))
972                         goto out_free;
973
974                 if (!try_module_get(prot->owner))
975                         goto out_free_sec;
976         }
977
978         return sk;
979
980 out_free_sec:
981         security_sk_free(sk);
982 out_free:
983         if (slab != NULL)
984                 kmem_cache_free(slab, sk);
985         else
986                 kfree(sk);
987         return NULL;
988 }
989
990 static void sk_prot_free(struct proto *prot, struct sock *sk)
991 {
992         struct kmem_cache *slab;
993         struct module *owner;
994
995         owner = prot->owner;
996         slab = prot->slab;
997
998         security_sk_free(sk);
999         if (slab != NULL)
1000                 kmem_cache_free(slab, sk);
1001         else
1002                 kfree(sk);
1003         module_put(owner);
1004 }
1005
1006 /**
1007  *      sk_alloc - All socket objects are allocated here
1008  *      @net: the applicable net namespace
1009  *      @family: protocol family
1010  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1011  *      @prot: struct proto associated with this new sock instance
1012  */
1013 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1014                       struct proto *prot)
1015 {
1016         struct sock *sk;
1017
1018         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1019         if (sk) {
1020                 sk->sk_family = family;
1021                 /*
1022                  * See comment in struct sock definition to understand
1023                  * why we need sk_prot_creator -acme
1024                  */
1025                 sk->sk_prot = sk->sk_prot_creator = prot;
1026                 sock_lock_init(sk);
1027                 sock_net_set(sk, get_net(net));
1028                 atomic_set(&sk->sk_wmem_alloc, 1);
1029         }
1030
1031         return sk;
1032 }
1033 EXPORT_SYMBOL(sk_alloc);
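/*
 * Hypothetical caller sketch: an address family's ->create() hook
 * would typically allocate its sock along the lines of
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &some_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where "some_proto" stands in for the protocol's struct proto.
 */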
1034
1035 static void __sk_free(struct sock *sk)
1036 {
1037         struct sk_filter *filter;
1038
1039         if (sk->sk_destruct)
1040                 sk->sk_destruct(sk);
1041
1042         filter = rcu_dereference(sk->sk_filter);
1043         if (filter) {
1044                 sk_filter_uncharge(sk, filter);
1045                 rcu_assign_pointer(sk->sk_filter, NULL);
1046         }
1047
1048         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1049         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1050
1051         if (atomic_read(&sk->sk_omem_alloc))
1052                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1053                        __func__, atomic_read(&sk->sk_omem_alloc));
1054
1055         put_net(sock_net(sk));
1056         sk_prot_free(sk->sk_prot_creator, sk);
1057 }
1058
1059 void sk_free(struct sock *sk)
1060 {
1061         /*
1062          * We subtract one from sk_wmem_alloc and can tell whether
1063          * some packets are still in some tx queue.
1064          * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1065          */
1066         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1067                 __sk_free(sk);
1068 }
1069 EXPORT_SYMBOL(sk_free);
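/*
 * Lifetime recap (no new behaviour, just the reasoning spelled out):
 * sk_alloc() starts sk_wmem_alloc at 1, so a socket with packets still
 * queued for transmit is kept alive by that extra reference.  sk_free()
 * merely drops it; the final __sk_free() then happens from sock_wfree()
 * when the last in-flight skb is destructed.
 */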
1070
1071 /*
1072  * The last sock_put should drop the reference to sk->sk_net. It has already
1073  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1074  * is not an option.
1075  * Take a reference to the socket to remove it from the hash while still
1076  * _alive_, and after that destroy it in the context of init_net.
1077  */
1078 void sk_release_kernel(struct sock *sk)
1079 {
1080         if (sk == NULL || sk->sk_socket == NULL)
1081                 return;
1082
1083         sock_hold(sk);
1084         sock_release(sk->sk_socket);
1085         release_net(sock_net(sk));
1086         sock_net_set(sk, get_net(&init_net));
1087         sock_put(sk);
1088 }
1089 EXPORT_SYMBOL(sk_release_kernel);
1090
1091 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1092 {
1093         struct sock *newsk;
1094
1095         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1096         if (newsk != NULL) {
1097                 struct sk_filter *filter;
1098
1099                 sock_copy(newsk, sk);
1100
1101                 /* SANITY */
1102                 get_net(sock_net(newsk));
1103                 sk_node_init(&newsk->sk_node);
1104                 sock_lock_init(newsk);
1105                 bh_lock_sock(newsk);
1106                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1107
1108                 atomic_set(&newsk->sk_rmem_alloc, 0);
1109                 /*
1110                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1111                  */
1112                 atomic_set(&newsk->sk_wmem_alloc, 1);
1113                 atomic_set(&newsk->sk_omem_alloc, 0);
1114                 skb_queue_head_init(&newsk->sk_receive_queue);
1115                 skb_queue_head_init(&newsk->sk_write_queue);
1116 #ifdef CONFIG_NET_DMA
1117                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1118 #endif
1119
1120                 rwlock_init(&newsk->sk_dst_lock);
1121                 rwlock_init(&newsk->sk_callback_lock);
1122                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1123                                 af_callback_keys + newsk->sk_family,
1124                                 af_family_clock_key_strings[newsk->sk_family]);
1125
1126                 newsk->sk_dst_cache     = NULL;
1127                 newsk->sk_wmem_queued   = 0;
1128                 newsk->sk_forward_alloc = 0;
1129                 newsk->sk_send_head     = NULL;
1130                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1131
1132                 sock_reset_flag(newsk, SOCK_DONE);
1133                 skb_queue_head_init(&newsk->sk_error_queue);
1134
1135                 filter = newsk->sk_filter;
1136                 if (filter != NULL)
1137                         sk_filter_charge(newsk, filter);
1138
1139                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1140                         /* It is still a raw copy of the parent, so invalidate
1141                          * the destructor and do a plain sk_free() */
1142                         newsk->sk_destruct = NULL;
1143                         sk_free(newsk);
1144                         newsk = NULL;
1145                         goto out;
1146                 }
1147
1148                 newsk->sk_err      = 0;
1149                 newsk->sk_priority = 0;
1150                 /*
1151                  * Before updating sk_refcnt, we must commit prior changes to memory
1152                  * (Documentation/RCU/rculist_nulls.txt for details)
1153                  */
1154                 smp_wmb();
1155                 atomic_set(&newsk->sk_refcnt, 2);
1156
1157                 /*
1158                  * Increment the counter in the same struct proto as the master
1159                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1160                  * is the same as sk->sk_prot->socks, as this field was copied
1161                  * with memcpy).
1162                  *
1163                  * This _changes_ the previous behaviour, where
1164                  * tcp_create_openreq_child always incremented the
1165                  * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1166                  * to be taken into account in all callers. -acme
1167                  */
1168                 sk_refcnt_debug_inc(newsk);
1169                 sk_set_socket(newsk, NULL);
1170                 newsk->sk_sleep  = NULL;
1171
1172                 if (newsk->sk_prot->sockets_allocated)
1173                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1174         }
1175 out:
1176         return newsk;
1177 }
1178 EXPORT_SYMBOL_GPL(sk_clone);
1179
1180 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1181 {
1182         __sk_dst_set(sk, dst);
1183         sk->sk_route_caps = dst->dev->features;
1184         if (sk->sk_route_caps & NETIF_F_GSO)
1185                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1186         if (sk_can_gso(sk)) {
1187                 if (dst->header_len) {
1188                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1189                 } else {
1190                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1191                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1192                 }
1193         }
1194 }
1195 EXPORT_SYMBOL_GPL(sk_setup_caps);
1196
1197 void __init sk_init(void)
1198 {
1199         if (num_physpages <= 4096) {
1200                 sysctl_wmem_max = 32767;
1201                 sysctl_rmem_max = 32767;
1202                 sysctl_wmem_default = 32767;
1203                 sysctl_rmem_default = 32767;
1204         } else if (num_physpages >= 131072) {
1205                 sysctl_wmem_max = 131071;
1206                 sysctl_rmem_max = 131071;
1207         }
1208 }
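/*
 * Rough scale of the thresholds above (assuming 4 KiB pages): 4096
 * pages is about 16 MB of RAM and 131072 pages about 512 MB, so the
 * small defaults apply to very small machines and the larger caps to
 * boxes with roughly half a gigabyte or more.
 */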
1209
1210 /*
1211  *      Simple resource managers for sockets.
1212  */
1213
1214
1215 /*
1216  * Write buffer destructor automatically called from kfree_skb.
1217  */
1218 void sock_wfree(struct sk_buff *skb)
1219 {
1220         struct sock *sk = skb->sk;
1221         int res;
1222
1223         /* In case it might be waiting for more memory. */
1224         res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1225         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1226                 sk->sk_write_space(sk);
1227         /*
1228          * if sk_wmem_alloc reached 0, we are the last user and should
1229          * free this sock, as the sk_free() call could not do it.
1230          */
1231         if (res == 0)
1232                 __sk_free(sk);
1233 }
1234 EXPORT_SYMBOL(sock_wfree);
1235
1236 /*
1237  * Read buffer destructor automatically called from kfree_skb.
1238  */
1239 void sock_rfree(struct sk_buff *skb)
1240 {
1241         struct sock *sk = skb->sk;
1242
1243         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1244         sk_mem_uncharge(skb->sk, skb->truesize);
1245 }
1246 EXPORT_SYMBOL(sock_rfree);
1247
1248
1249 int sock_i_uid(struct sock *sk)
1250 {
1251         int uid;
1252
1253         read_lock(&sk->sk_callback_lock);
1254         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1255         read_unlock(&sk->sk_callback_lock);
1256         return uid;
1257 }
1258 EXPORT_SYMBOL(sock_i_uid);
1259
1260 unsigned long sock_i_ino(struct sock *sk)
1261 {
1262         unsigned long ino;
1263
1264         read_lock(&sk->sk_callback_lock);
1265         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1266         read_unlock(&sk->sk_callback_lock);
1267         return ino;
1268 }
1269 EXPORT_SYMBOL(sock_i_ino);
1270
1271 /*
1272  * Allocate a skb from the socket's send buffer.
1273  */
1274 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1275                              gfp_t priority)
1276 {
1277         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1278                 struct sk_buff *skb = alloc_skb(size, priority);
1279                 if (skb) {
1280                         skb_set_owner_w(skb, sk);
1281                         return skb;
1282                 }
1283         }
1284         return NULL;
1285 }
1286 EXPORT_SYMBOL(sock_wmalloc);
1287
1288 /*
1289  * Allocate a skb from the socket's receive buffer.
1290  */
1291 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1292                              gfp_t priority)
1293 {
1294         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1295                 struct sk_buff *skb = alloc_skb(size, priority);
1296                 if (skb) {
1297                         skb_set_owner_r(skb, sk);
1298                         return skb;
1299                 }
1300         }
1301         return NULL;
1302 }
1303
1304 /*
1305  * Allocate a memory block from the socket's option memory buffer.
1306  */
1307 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1308 {
1309         if ((unsigned)size <= sysctl_optmem_max &&
1310             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1311                 void *mem;
1312                 /* First do the add, to avoid the race if kmalloc
1313                  * might sleep.
1314                  */
1315                 atomic_add(size, &sk->sk_omem_alloc);
1316                 mem = kmalloc(size, priority);
1317                 if (mem)
1318                         return mem;
1319                 atomic_sub(size, &sk->sk_omem_alloc);
1320         }
1321         return NULL;
1322 }
1323 EXPORT_SYMBOL(sock_kmalloc);
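/*
 * Hypothetical usage sketch: option memory is charged against
 * sk_omem_alloc and must be released with the matching size, e.g.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 *
 * Requests that would push the socket past sysctl_optmem_max simply
 * fail with NULL.
 */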
1324
1325 /*
1326  * Free an option memory block.
1327  */
1328 void sock_kfree_s(struct sock *sk, void *mem, int size)
1329 {
1330         kfree(mem);
1331         atomic_sub(size, &sk->sk_omem_alloc);
1332 }
1333 EXPORT_SYMBOL(sock_kfree_s);
1334
1335 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1336    I think these locks should be removed for datagram sockets.
1337  */
1338 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1339 {
1340         DEFINE_WAIT(wait);
1341
1342         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1343         for (;;) {
1344                 if (!timeo)
1345                         break;
1346                 if (signal_pending(current))
1347                         break;
1348                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1349                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1350                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1351                         break;
1352                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1353                         break;
1354                 if (sk->sk_err)
1355                         break;
1356                 timeo = schedule_timeout(timeo);
1357         }
1358         finish_wait(sk->sk_sleep, &wait);
1359         return timeo;
1360 }
1361
1362
1363 /*
1364  *      Generic send/receive buffer handlers
1365  */
1366
1367 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1368                                      unsigned long data_len, int noblock,
1369                                      int *errcode)
1370 {
1371         struct sk_buff *skb;
1372         gfp_t gfp_mask;
1373         long timeo;
1374         int err;
1375
1376         gfp_mask = sk->sk_allocation;
1377         if (gfp_mask & __GFP_WAIT)
1378                 gfp_mask |= __GFP_REPEAT;
1379
1380         timeo = sock_sndtimeo(sk, noblock);
1381         while (1) {
1382                 err = sock_error(sk);
1383                 if (err != 0)
1384                         goto failure;
1385
1386                 err = -EPIPE;
1387                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1388                         goto failure;
1389
1390                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1391                         skb = alloc_skb(header_len, gfp_mask);
1392                         if (skb) {
1393                                 int npages;
1394                                 int i;
1395
1396                                 /* No pages, we're done... */
1397                                 if (!data_len)
1398                                         break;
1399
1400                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1401                                 skb->truesize += data_len;
1402                                 skb_shinfo(skb)->nr_frags = npages;
1403                                 for (i = 0; i < npages; i++) {
1404                                         struct page *page;
1405                                         skb_frag_t *frag;
1406
1407                                         page = alloc_pages(sk->sk_allocation, 0);
1408                                         if (!page) {
1409                                                 err = -ENOBUFS;
1410                                                 skb_shinfo(skb)->nr_frags = i;
1411                                                 kfree_skb(skb);
1412                                                 goto failure;
1413                                         }
1414
1415                                         frag = &skb_shinfo(skb)->frags[i];
1416                                         frag->page = page;
1417                                         frag->page_offset = 0;
1418                                         frag->size = (data_len >= PAGE_SIZE ?
1419                                                       PAGE_SIZE :
1420                                                       data_len);
1421                                         data_len -= PAGE_SIZE;
1422                                 }
1423
1424                                 /* Full success... */
1425                                 break;
1426                         }
1427                         err = -ENOBUFS;
1428                         goto failure;
1429                 }
1430                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1431                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1432                 err = -EAGAIN;
1433                 if (!timeo)
1434                         goto failure;
1435                 if (signal_pending(current))
1436                         goto interrupted;
1437                 timeo = sock_wait_for_wmem(sk, timeo);
1438         }
1439
1440         skb_set_owner_w(skb, sk);
1441         return skb;
1442
1443 interrupted:
1444         err = sock_intr_errno(timeo);
1445 failure:
1446         *errcode = err;
1447         return NULL;
1448 }
1449 EXPORT_SYMBOL(sock_alloc_send_pskb);
1450
1451 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1452                                     int noblock, int *errcode)
1453 {
1454         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1455 }
1456 EXPORT_SYMBOL(sock_alloc_send_skb);
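/*
 * Hypothetical caller sketch: a datagram sendmsg() implementation
 * would typically do something like
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve, noblock, &err);
 *	if (!skb)
 *		return err;
 *
 * blocking (subject to SO_SNDTIMEO) until sk_wmem_alloc drops below
 * sk_sndbuf, or failing with a negative error (-EAGAIN, -EPIPE, ...)
 * written through &err.  "len", "reserve" and "noblock" are just
 * placeholders here.
 */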
1457
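/*
 * Sleep (uninterruptibly) until the socket lock is no longer owned by a
 * process context.  Called with sk_lock.slock held; the spinlock is
 * dropped around schedule() and re-taken before ownership is re-checked.
 */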
1458 static void __lock_sock(struct sock *sk)
1459 {
1460         DEFINE_WAIT(wait);
1461
1462         for (;;) {
1463                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1464                                         TASK_UNINTERRUPTIBLE);
1465                 spin_unlock_bh(&sk->sk_lock.slock);
1466                 schedule();
1467                 spin_lock_bh(&sk->sk_lock.slock);
1468                 if (!sock_owned_by_user(sk))
1469                         break;
1470         }
1471         finish_wait(&sk->sk_lock.wq, &wait);
1472 }
1473
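/*
 * Run the backlog of packets that softirq context queued while the
 * socket was owned by a process.  The backlog list is detached first,
 * so new packets can keep arriving while the old ones are processed.
 */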
1474 static void __release_sock(struct sock *sk)
1475 {
1476         struct sk_buff *skb = sk->sk_backlog.head;
1477
1478         do {
1479                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1480                 bh_unlock_sock(sk);
1481
1482                 do {
1483                         struct sk_buff *next = skb->next;
1484
1485                         skb->next = NULL;
1486                         sk_backlog_rcv(sk, skb);
1487
1488                         /*
1489                          * We are in process context here with softirqs
1490                          * disabled, use cond_resched_softirq() to preempt.
1491                          * This is safe to do because we've taken the backlog
1492                          * queue private:
1493                          */
1494                         cond_resched_softirq();
1495
1496                         skb = next;
1497                 } while (skb != NULL);
1498
1499                 bh_lock_sock(sk);
1500         } while ((skb = sk->sk_backlog.head) != NULL);
1501 }
1502
1503 /**
1504  * sk_wait_data - wait for data to arrive at sk_receive_queue
1505  * @sk:    sock to wait on
1506  * @timeo: for how long
1507  *
1508  * Now socket state including sk->sk_err is changed only under lock,
1509  * hence we may omit checks after joining wait queue.
1510  * We check receive queue before schedule() only as optimization;
1511  * it is very likely that release_sock() added new data.
1512  */
1513 int sk_wait_data(struct sock *sk, long *timeo)
1514 {
1515         int rc;
1516         DEFINE_WAIT(wait);
1517
1518         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1519         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1520         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1521         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1522         finish_wait(sk->sk_sleep, &wait);
1523         return rc;
1524 }
1525 EXPORT_SYMBOL(sk_wait_data);
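/*
 * Typical caller (sketch, not from this file): a protocol's recvmsg()
 * holds the socket lock and, while its receive queue is empty and the
 * timeout has not expired, calls sk_wait_data(sk, &timeo) before
 * looking at the queue again.
 */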
1526
1527 /**
1528  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1529  *      @sk: socket
1530  *      @size: memory size to allocate
1531  *      @kind: allocation type
1532  *
1533  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1534  *      rmem allocation. This function assumes that protocols which have
1535  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1536  */
1537 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1538 {
1539         struct proto *prot = sk->sk_prot;
1540         int amt = sk_mem_pages(size);
1541         int allocated;
1542
1543         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1544         allocated = atomic_add_return(amt, prot->memory_allocated);
1545
1546         /* Under limit. */
1547         if (allocated <= prot->sysctl_mem[0]) {
1548                 if (prot->memory_pressure && *prot->memory_pressure)
1549                         *prot->memory_pressure = 0;
1550                 return 1;
1551         }
1552
1553         /* Under pressure. */
1554         if (allocated > prot->sysctl_mem[1])
1555                 if (prot->enter_memory_pressure)
1556                         prot->enter_memory_pressure(sk);
1557
1558         /* Over hard limit. */
1559         if (allocated > prot->sysctl_mem[2])
1560                 goto suppress_allocation;
1561
1562         /* guarantee minimum buffer size under pressure */
1563         if (kind == SK_MEM_RECV) {
1564                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1565                         return 1;
1566         } else { /* SK_MEM_SEND */
1567                 if (sk->sk_type == SOCK_STREAM) {
1568                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1569                                 return 1;
1570                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1571                            prot->sysctl_wmem[0])
1572                                 return 1;
1573         }
1574
1575         if (prot->memory_pressure) {
1576                 int alloc;
1577
1578                 if (!*prot->memory_pressure)
1579                         return 1;
1580                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1581                 if (prot->sysctl_mem[2] > alloc *
1582                     sk_mem_pages(sk->sk_wmem_queued +
1583                                  atomic_read(&sk->sk_rmem_alloc) +
1584                                  sk->sk_forward_alloc))
1585                         return 1;
1586         }
1587
1588 suppress_allocation:
1589
1590         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1591                 sk_stream_moderate_sndbuf(sk);
1592
1593                 /* Fail only if socket is _under_ its sndbuf.
1594                  * In this case we cannot block, so we have to fail.
1595                  */
1596                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1597                         return 1;
1598         }
1599
1600         /* Alas. Undo changes. */
1601         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1602         atomic_sub(amt, prot->memory_allocated);
1603         return 0;
1604 }
1605 EXPORT_SYMBOL(__sk_mem_schedule);
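/*
 * Note: callers normally reach this through the sk_wmem_schedule() /
 * sk_rmem_schedule() inline helpers in include/net/sock.h, which fall
 * back to __sk_mem_schedule() only when the request does not fit in
 * sk_forward_alloc.
 */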
1606
1607 /**
1608  *      __sk_mem_reclaim - reclaim memory_allocated
1609  *      @sk: socket
1610  */
1611 void __sk_mem_reclaim(struct sock *sk)
1612 {
1613         struct proto *prot = sk->sk_prot;
1614
1615         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1616                    prot->memory_allocated);
1617         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1618
1619         if (prot->memory_pressure && *prot->memory_pressure &&
1620             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1621                 *prot->memory_pressure = 0;
1622 }
1623 EXPORT_SYMBOL(__sk_mem_reclaim);
1624
1625
1626 /*
1627  * Set of default routines for initialising struct proto_ops when
1628  * the protocol does not support a particular function. In certain
1629  * cases where it makes no sense for a protocol to have a "do nothing"
1630  * function, some default processing is provided.
1631  */
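/*
 * Illustrative use (hypothetical PF_EXAMPLE, not from this file): a
 * protocol that does not support a given operation simply points the
 * corresponding proto_ops member at one of these stubs, e.g.
 *
 *      static const struct proto_ops example_ops = {
 *              .family         = PF_EXAMPLE,
 *              .accept         = sock_no_accept,
 *              .connect        = sock_no_connect,
 *              .mmap           = sock_no_mmap,
 *      };
 */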
1632
1633 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1634 {
1635         return -EOPNOTSUPP;
1636 }
1637 EXPORT_SYMBOL(sock_no_bind);
1638
1639 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1640                     int len, int flags)
1641 {
1642         return -EOPNOTSUPP;
1643 }
1644 EXPORT_SYMBOL(sock_no_connect);
1645
1646 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1647 {
1648         return -EOPNOTSUPP;
1649 }
1650 EXPORT_SYMBOL(sock_no_socketpair);
1651
1652 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1653 {
1654         return -EOPNOTSUPP;
1655 }
1656 EXPORT_SYMBOL(sock_no_accept);
1657
1658 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1659                     int *len, int peer)
1660 {
1661         return -EOPNOTSUPP;
1662 }
1663 EXPORT_SYMBOL(sock_no_getname);
1664
1665 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1666 {
1667         return 0;
1668 }
1669 EXPORT_SYMBOL(sock_no_poll);
1670
1671 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1672 {
1673         return -EOPNOTSUPP;
1674 }
1675 EXPORT_SYMBOL(sock_no_ioctl);
1676
1677 int sock_no_listen(struct socket *sock, int backlog)
1678 {
1679         return -EOPNOTSUPP;
1680 }
1681 EXPORT_SYMBOL(sock_no_listen);
1682
1683 int sock_no_shutdown(struct socket *sock, int how)
1684 {
1685         return -EOPNOTSUPP;
1686 }
1687 EXPORT_SYMBOL(sock_no_shutdown);
1688
1689 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1690                     char __user *optval, int optlen)
1691 {
1692         return -EOPNOTSUPP;
1693 }
1694 EXPORT_SYMBOL(sock_no_setsockopt);
1695
1696 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1697                     char __user *optval, int __user *optlen)
1698 {
1699         return -EOPNOTSUPP;
1700 }
1701 EXPORT_SYMBOL(sock_no_getsockopt);
1702
1703 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1704                     size_t len)
1705 {
1706         return -EOPNOTSUPP;
1707 }
1708 EXPORT_SYMBOL(sock_no_sendmsg);
1709
1710 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1711                     size_t len, int flags)
1712 {
1713         return -EOPNOTSUPP;
1714 }
1715 EXPORT_SYMBOL(sock_no_recvmsg);
1716
1717 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1718 {
1719         /* Mirror missing mmap method error code */
1720         return -ENODEV;
1721 }
1722 EXPORT_SYMBOL(sock_no_mmap);
1723
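/*
 * Fallback sendpage: no zero-copy here - the page is kmap()ed and its
 * contents pushed through kernel_sendmsg(), so any protocol that has a
 * sendmsg implementation gets a correct (if slower) sendpage for free.
 */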
1724 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1725 {
1726         ssize_t res;
1727         struct msghdr msg = {.msg_flags = flags};
1728         struct kvec iov;
1729         char *kaddr = kmap(page);
1730         iov.iov_base = kaddr + offset;
1731         iov.iov_len = size;
1732         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1733         kunmap(page);
1734         return res;
1735 }
1736 EXPORT_SYMBOL(sock_no_sendpage);
1737
1738 /*
1739  *      Default Socket Callbacks
1740  */
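/*
 * Installed on every socket by sock_init_data() below; protocols that
 * need different behaviour override the sk_state_change, sk_data_ready,
 * sk_write_space, sk_error_report or sk_destruct pointers afterwards.
 */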
1741
1742 static void sock_def_wakeup(struct sock *sk)
1743 {
1744         read_lock(&sk->sk_callback_lock);
1745         if (sk_has_sleeper(sk))
1746                 wake_up_interruptible_all(sk->sk_sleep);
1747         read_unlock(&sk->sk_callback_lock);
1748 }
1749
1750 static void sock_def_error_report(struct sock *sk)
1751 {
1752         read_lock(&sk->sk_callback_lock);
1753         if (sk_has_sleeper(sk))
1754                 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1755         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1756         read_unlock(&sk->sk_callback_lock);
1757 }
1758
1759 static void sock_def_readable(struct sock *sk, int len)
1760 {
1761         read_lock(&sk->sk_callback_lock);
1762         if (sk_has_sleeper(sk))
1763                 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1764                                                 POLLRDNORM | POLLRDBAND);
1765         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1766         read_unlock(&sk->sk_callback_lock);
1767 }
1768
1769 static void sock_def_write_space(struct sock *sk)
1770 {
1771         read_lock(&sk->sk_callback_lock);
1772
1773         /* Do not wake up a writer until he can make "significant"
1774          * progress.  --DaveM
1775          */
1776         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1777                 if (sk_has_sleeper(sk))
1778                         wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1779                                                 POLLWRNORM | POLLWRBAND);
1780
1781                 /* Should agree with poll, otherwise some programs break */
1782                 if (sock_writeable(sk))
1783                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1784         }
1785
1786         read_unlock(&sk->sk_callback_lock);
1787 }
1788
1789 static void sock_def_destruct(struct sock *sk)
1790 {
1791         kfree(sk->sk_protinfo);
1792 }
1793
1794 void sk_send_sigurg(struct sock *sk)
1795 {
1796         if (sk->sk_socket && sk->sk_socket->file)
1797                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1798                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1799 }
1800 EXPORT_SYMBOL(sk_send_sigurg);
1801
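/*
 * sk_reset_timer()/sk_stop_timer() keep a socket reference paired with
 * the timer: arming a previously idle timer takes a reference
 * (mod_timer() returns 0 in that case), and deleting a still-pending
 * timer drops it again, so the sock cannot be freed under a live timer.
 */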
1802 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1803                     unsigned long expires)
1804 {
1805         if (!mod_timer(timer, expires))
1806                 sock_hold(sk);
1807 }
1808 EXPORT_SYMBOL(sk_reset_timer);
1809
1810 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1811 {
1812         if (timer_pending(timer) && del_timer(timer))
1813                 __sock_put(sk);
1814 }
1815 EXPORT_SYMBOL(sk_stop_timer);
1816
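/*
 * Initialise the protocol-independent part of a struct sock: the queues,
 * default buffer sizes and timeouts, the default callbacks above, and
 * the link between the struct socket and the struct sock.
 */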
1817 void sock_init_data(struct socket *sock, struct sock *sk)
1818 {
1819         skb_queue_head_init(&sk->sk_receive_queue);
1820         skb_queue_head_init(&sk->sk_write_queue);
1821         skb_queue_head_init(&sk->sk_error_queue);
1822 #ifdef CONFIG_NET_DMA
1823         skb_queue_head_init(&sk->sk_async_wait_queue);
1824 #endif
1825
1826         sk->sk_send_head        =       NULL;
1827
1828         init_timer(&sk->sk_timer);
1829
1830         sk->sk_allocation       =       GFP_KERNEL;
1831         sk->sk_rcvbuf           =       sysctl_rmem_default;
1832         sk->sk_sndbuf           =       sysctl_wmem_default;
1833         sk->sk_state            =       TCP_CLOSE;
1834         sk_set_socket(sk, sock);
1835
1836         sock_set_flag(sk, SOCK_ZAPPED);
1837
1838         if (sock) {
1839                 sk->sk_type     =       sock->type;
1840                 sk->sk_sleep    =       &sock->wait;
1841                 sock->sk        =       sk;
1842         } else
1843                 sk->sk_sleep    =       NULL;
1844
1845         rwlock_init(&sk->sk_dst_lock);
1846         rwlock_init(&sk->sk_callback_lock);
1847         lockdep_set_class_and_name(&sk->sk_callback_lock,
1848                         af_callback_keys + sk->sk_family,
1849                         af_family_clock_key_strings[sk->sk_family]);
1850
1851         sk->sk_state_change     =       sock_def_wakeup;
1852         sk->sk_data_ready       =       sock_def_readable;
1853         sk->sk_write_space      =       sock_def_write_space;
1854         sk->sk_error_report     =       sock_def_error_report;
1855         sk->sk_destruct         =       sock_def_destruct;
1856
1857         sk->sk_sndmsg_page      =       NULL;
1858         sk->sk_sndmsg_off       =       0;
1859
1860         sk->sk_peercred.pid     =       0;
1861         sk->sk_peercred.uid     =       -1;
1862         sk->sk_peercred.gid     =       -1;
1863         sk->sk_write_pending    =       0;
1864         sk->sk_rcvlowat         =       1;
1865         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1866         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1867
1868         sk->sk_stamp = ktime_set(-1L, 0);
1869
1870         /*
1871          * Before updating sk_refcnt, we must commit prior changes to memory
1872          * (Documentation/RCU/rculist_nulls.txt for details)
1873          */
1874         smp_wmb();
1875         atomic_set(&sk->sk_refcnt, 1);
1876         atomic_set(&sk->sk_drops, 0);
1877 }
1878 EXPORT_SYMBOL(sock_init_data);
1879
1880 void lock_sock_nested(struct sock *sk, int subclass)
1881 {
1882         might_sleep();
1883         spin_lock_bh(&sk->sk_lock.slock);
1884         if (sk->sk_lock.owned)
1885                 __lock_sock(sk);
1886         sk->sk_lock.owned = 1;
1887         spin_unlock(&sk->sk_lock.slock);
1888         /*
1889          * The sk_lock has mutex_lock() semantics here:
1890          */
1891         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1892         local_bh_enable();
1893 }
1894 EXPORT_SYMBOL(lock_sock_nested);
1895
1896 void release_sock(struct sock *sk)
1897 {
1898         /*
1899          * The sk_lock has mutex_unlock() semantics:
1900          */
1901         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1902
1903         spin_lock_bh(&sk->sk_lock.slock);
1904         if (sk->sk_backlog.tail)
1905                 __release_sock(sk);
1906         sk->sk_lock.owned = 0;
1907         if (waitqueue_active(&sk->sk_lock.wq))
1908                 wake_up(&sk->sk_lock.wq);
1909         spin_unlock_bh(&sk->sk_lock.slock);
1910 }
1911 EXPORT_SYMBOL(release_sock);
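/*
 * Usage sketch (process context); lock_sock() in include/net/sock.h is
 * simply lock_sock_nested(sk, 0):
 *
 *      lock_sock(sk);
 *      ... modify socket state, may sleep ...
 *      release_sock(sk);       (also runs any backlog queued by softirqs)
 */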
1912
1913 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1914 {
1915         struct timeval tv;
1916         if (!sock_flag(sk, SOCK_TIMESTAMP))
1917                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1918         tv = ktime_to_timeval(sk->sk_stamp);
1919         if (tv.tv_sec == -1)
1920                 return -ENOENT;
1921         if (tv.tv_sec == 0) {
1922                 sk->sk_stamp = ktime_get_real();
1923                 tv = ktime_to_timeval(sk->sk_stamp);
1924         }
1925         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1926 }
1927 EXPORT_SYMBOL(sock_get_timestamp);
1928
1929 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1930 {
1931         struct timespec ts;
1932         if (!sock_flag(sk, SOCK_TIMESTAMP))
1933                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1934         ts = ktime_to_timespec(sk->sk_stamp);
1935         if (ts.tv_sec == -1)
1936                 return -ENOENT;
1937         if (ts.tv_sec == 0) {
1938                 sk->sk_stamp = ktime_get_real();
1939                 ts = ktime_to_timespec(sk->sk_stamp);
1940         }
1941         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1942 }
1943 EXPORT_SYMBOL(sock_get_timestampns);
1944
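/*
 * Called lazily, e.g. from sock_get_timestamp()/sock_get_timestampns()
 * above when SOCK_TIMESTAMP is not yet set.  net_enable_timestamp() is
 * only invoked if the companion flag did not already turn global
 * timestamping on.
 */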
1945 void sock_enable_timestamp(struct sock *sk, int flag)
1946 {
1947         if (!sock_flag(sk, flag)) {
1948                 sock_set_flag(sk, flag);
1949                 /*
1950                  * we just set one of the two flags which require net
1951                  * time stamping, but time stamping might have been on
1952                  * already because of the other one
1953                  */
1954                 if (!sock_flag(sk,
1955                                 flag == SOCK_TIMESTAMP ?
1956                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
1957                                 SOCK_TIMESTAMP))
1958                         net_enable_timestamp();
1959         }
1960 }
1961
1962 /*
1963  *      Get a socket option on a socket.
1964  *
1965  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1966  *      asynchronous errors should be reported by getsockopt. We assume
1967  *      this means if you specify SO_ERROR (otherwise what's the point of it).
1968  */
1969 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1970                            char __user *optval, int __user *optlen)
1971 {
1972         struct sock *sk = sock->sk;
1973
1974         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1975 }
1976 EXPORT_SYMBOL(sock_common_getsockopt);
1977
1978 #ifdef CONFIG_COMPAT
1979 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1980                                   char __user *optval, int __user *optlen)
1981 {
1982         struct sock *sk = sock->sk;
1983
1984         if (sk->sk_prot->compat_getsockopt != NULL)
1985                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1986                                                       optval, optlen);
1987         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1988 }
1989 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1990 #endif
1991
1992 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1993                         struct msghdr *msg, size_t size, int flags)
1994 {
1995         struct sock *sk = sock->sk;
1996         int addr_len = 0;
1997         int err;
1998
1999         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2000                                    flags & ~MSG_DONTWAIT, &addr_len);
2001         if (err >= 0)
2002                 msg->msg_namelen = addr_len;
2003         return err;
2004 }
2005 EXPORT_SYMBOL(sock_common_recvmsg);
2006
2007 /*
2008  *      Set socket options on an inet socket.
2009  */
2010 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2011                            char __user *optval, int optlen)
2012 {
2013         struct sock *sk = sock->sk;
2014
2015         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2016 }
2017 EXPORT_SYMBOL(sock_common_setsockopt);
2018
2019 #ifdef CONFIG_COMPAT
2020 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2021                                   char __user *optval, int optlen)
2022 {
2023         struct sock *sk = sock->sk;
2024
2025         if (sk->sk_prot->compat_setsockopt != NULL)
2026                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2027                                                       optval, optlen);
2028         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2029 }
2030 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2031 #endif
2032
2033 void sk_common_release(struct sock *sk)
2034 {
2035         if (sk->sk_prot->destroy)
2036                 sk->sk_prot->destroy(sk);
2037
2038         /*
2039          * Observation: when sk_common_release is called, processes have
2040          * no access to the socket any more, but the network stack still does.
2041          * Step one, detach it from networking:
2042          *
2043          * A. Remove from hash tables.
2044          */
2045
2046         sk->sk_prot->unhash(sk);
2047
2048         /*
2049          * At this point the socket cannot receive new packets, but packets
2050          * may still be in flight: another CPU may have done the hash table
2051          * lookup before we unhashed the socket. Such packets will reach the
2052          * receive queue and be purged by the socket destructor.
2053          *
2054          * We also still have packets pending on the receive queue and,
2055          * probably, our own packets waiting in device queues. sock_destroy
2056          * will drain the receive queue, but transmitted packets delay socket
2057          * destruction until the last reference is released.
2058          */
2059
2060         sock_orphan(sk);
2061
2062         xfrm_sk_free_policy(sk);
2063
2064         sk_refcnt_debug_release(sk);
2065         sock_put(sk);
2066 }
2067 EXPORT_SYMBOL(sk_common_release);
2068
2069 static DEFINE_RWLOCK(proto_list_lock);
2070 static LIST_HEAD(proto_list);
2071
2072 #ifdef CONFIG_PROC_FS
2073 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2074 struct prot_inuse {
2075         int val[PROTO_INUSE_NR];
2076 };
2077
2078 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2079
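/*
 * Per-cpu, per-protocol "sockets in use" counters.  The summed value
 * returned by sock_prot_inuse_get() feeds the "sockets" column of
 * /proc/net/protocols (see proto_seq_printf() below).  Two variants
 * follow: per-netns storage under CONFIG_NET_NS, a single static
 * per-cpu variable otherwise.
 */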
2080 #ifdef CONFIG_NET_NS
2081 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2082 {
2083         int cpu = smp_processor_id();
2084         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2085 }
2086 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2087
2088 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2089 {
2090         int cpu, idx = prot->inuse_idx;
2091         int res = 0;
2092
2093         for_each_possible_cpu(cpu)
2094                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2095
2096         return res >= 0 ? res : 0;
2097 }
2098 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2099
2100 static int sock_inuse_init_net(struct net *net)
2101 {
2102         net->core.inuse = alloc_percpu(struct prot_inuse);
2103         return net->core.inuse ? 0 : -ENOMEM;
2104 }
2105
2106 static void sock_inuse_exit_net(struct net *net)
2107 {
2108         free_percpu(net->core.inuse);
2109 }
2110
2111 static struct pernet_operations net_inuse_ops = {
2112         .init = sock_inuse_init_net,
2113         .exit = sock_inuse_exit_net,
2114 };
2115
2116 static __init int net_inuse_init(void)
2117 {
2118         if (register_pernet_subsys(&net_inuse_ops))
2119                 panic("Cannot initialize net inuse counters");
2120
2121         return 0;
2122 }
2123
2124 core_initcall(net_inuse_init);
2125 #else
2126 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2127
2128 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2129 {
2130         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2131 }
2132 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2133
2134 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2135 {
2136         int cpu, idx = prot->inuse_idx;
2137         int res = 0;
2138
2139         for_each_possible_cpu(cpu)
2140                 res += per_cpu(prot_inuse, cpu).val[idx];
2141
2142         return res >= 0 ? res : 0;
2143 }
2144 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2145 #endif
2146
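/*
 * Each registered proto gets a slot in proto_inuse_idx.  The last bit is
 * deliberately left as a sentinel: if the bitmap is exhausted, inuse_idx
 * stays at PROTO_INUSE_NR - 1 and release_proto_idx() knows not to
 * clear it.
 */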
2147 static void assign_proto_idx(struct proto *prot)
2148 {
2149         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2150
2151         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2152                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2153                 return;
2154         }
2155
2156         set_bit(prot->inuse_idx, proto_inuse_idx);
2157 }
2158
2159 static void release_proto_idx(struct proto *prot)
2160 {
2161         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2162                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2163 }
2164 #else
2165 static inline void assign_proto_idx(struct proto *prot)
2166 {
2167 }
2168
2169 static inline void release_proto_idx(struct proto *prot)
2170 {
2171 }
2172 #endif
2173
2174 int proto_register(struct proto *prot, int alloc_slab)
2175 {
2176         if (alloc_slab) {
2177                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2178                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2179                                         NULL);
2180
2181                 if (prot->slab == NULL) {
2182                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2183                                prot->name);
2184                         goto out;
2185                 }
2186
2187                 if (prot->rsk_prot != NULL) {
2188                         static const char mask[] = "request_sock_%s";
2189
2190                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2191                         if (prot->rsk_prot->slab_name == NULL)
2192                                 goto out_free_sock_slab;
2193
2194                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2195                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2196                                                                  prot->rsk_prot->obj_size, 0,
2197                                                                  SLAB_HWCACHE_ALIGN, NULL);
2198
2199                         if (prot->rsk_prot->slab == NULL) {
2200                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2201                                        prot->name);
2202                                 goto out_free_request_sock_slab_name;
2203                         }
2204                 }
2205
2206                 if (prot->twsk_prot != NULL) {
2207                         static const char mask[] = "tw_sock_%s";
2208
2209                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2210
2211                         if (prot->twsk_prot->twsk_slab_name == NULL)
2212                                 goto out_free_request_sock_slab;
2213
2214                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2215                         prot->twsk_prot->twsk_slab =
2216                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2217                                                   prot->twsk_prot->twsk_obj_size,
2218                                                   0,
2219                                                   SLAB_HWCACHE_ALIGN |
2220                                                         prot->slab_flags,
2221                                                   NULL);
2222                         if (prot->twsk_prot->twsk_slab == NULL)
2223                                 goto out_free_timewait_sock_slab_name;
2224                 }
2225         }
2226
2227         write_lock(&proto_list_lock);
2228         list_add(&prot->node, &proto_list);
2229         assign_proto_idx(prot);
2230         write_unlock(&proto_list_lock);
2231         return 0;
2232
2233 out_free_timewait_sock_slab_name:
2234         kfree(prot->twsk_prot->twsk_slab_name);
2235 out_free_request_sock_slab:
2236         if (prot->rsk_prot && prot->rsk_prot->slab) {
2237                 kmem_cache_destroy(prot->rsk_prot->slab);
2238                 prot->rsk_prot->slab = NULL;
2239         }
2240 out_free_request_sock_slab_name:
2241         if (prot->rsk_prot)     /* may be NULL when only twsk setup failed */
                 kfree(prot->rsk_prot->slab_name);
2242 out_free_sock_slab:
2243         kmem_cache_destroy(prot->slab);
2244         prot->slab = NULL;
2245 out:
2246         return -ENOBUFS;
2247 }
2248 EXPORT_SYMBOL(proto_register);
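/*
 * Typical call (sketch, from the protocols' init code rather than this
 * file): inet_init() registers TCP with a backing slab cache via
 * proto_register(&tcp_prot, 1); protocols that allocate their socks some
 * other way pass alloc_slab == 0.
 */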
2249
2250 void proto_unregister(struct proto *prot)
2251 {
2252         write_lock(&proto_list_lock);
2253         release_proto_idx(prot);
2254         list_del(&prot->node);
2255         write_unlock(&proto_list_lock);
2256
2257         if (prot->slab != NULL) {
2258                 kmem_cache_destroy(prot->slab);
2259                 prot->slab = NULL;
2260         }
2261
2262         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2263                 kmem_cache_destroy(prot->rsk_prot->slab);
2264                 kfree(prot->rsk_prot->slab_name);
2265                 prot->rsk_prot->slab = NULL;
2266         }
2267
2268         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2269                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2270                 kfree(prot->twsk_prot->twsk_slab_name);
2271                 prot->twsk_prot->twsk_slab = NULL;
2272         }
2273 }
2274 EXPORT_SYMBOL(proto_unregister);
2275
2276 #ifdef CONFIG_PROC_FS
2277 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2278         __acquires(proto_list_lock)
2279 {
2280         read_lock(&proto_list_lock);
2281         return seq_list_start_head(&proto_list, *pos);
2282 }
2283
2284 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2285 {
2286         return seq_list_next(v, &proto_list, pos);
2287 }
2288
2289 static void proto_seq_stop(struct seq_file *seq, void *v)
2290         __releases(proto_list_lock)
2291 {
2292         read_unlock(&proto_list_lock);
2293 }
2294
2295 static char proto_method_implemented(const void *method)
2296 {
2297         return method == NULL ? 'n' : 'y';
2298 }
2299
2300 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2301 {
2302         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2303                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2304                    proto->name,
2305                    proto->obj_size,
2306                    sock_prot_inuse_get(seq_file_net(seq), proto),
2307                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2308                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2309                    proto->max_header,
2310                    proto->slab == NULL ? "no" : "yes",
2311                    module_name(proto->owner),
2312                    proto_method_implemented(proto->close),
2313                    proto_method_implemented(proto->connect),
2314                    proto_method_implemented(proto->disconnect),
2315                    proto_method_implemented(proto->accept),
2316                    proto_method_implemented(proto->ioctl),
2317                    proto_method_implemented(proto->init),
2318                    proto_method_implemented(proto->destroy),
2319                    proto_method_implemented(proto->shutdown),
2320                    proto_method_implemented(proto->setsockopt),
2321                    proto_method_implemented(proto->getsockopt),
2322                    proto_method_implemented(proto->sendmsg),
2323                    proto_method_implemented(proto->recvmsg),
2324                    proto_method_implemented(proto->sendpage),
2325                    proto_method_implemented(proto->bind),
2326                    proto_method_implemented(proto->backlog_rcv),
2327                    proto_method_implemented(proto->hash),
2328                    proto_method_implemented(proto->unhash),
2329                    proto_method_implemented(proto->get_port),
2330                    proto_method_implemented(proto->enter_memory_pressure));
2331 }
2332
2333 static int proto_seq_show(struct seq_file *seq, void *v)
2334 {
2335         if (v == &proto_list)
2336                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2337                            "protocol",
2338                            "size",
2339                            "sockets",
2340                            "memory",
2341                            "press",
2342                            "maxhdr",
2343                            "slab",
2344                            "module",
2345                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2346         else
2347                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2348         return 0;
2349 }
2350
2351 static const struct seq_operations proto_seq_ops = {
2352         .start  = proto_seq_start,
2353         .next   = proto_seq_next,
2354         .stop   = proto_seq_stop,
2355         .show   = proto_seq_show,
2356 };
2357
2358 static int proto_seq_open(struct inode *inode, struct file *file)
2359 {
2360         return seq_open_net(inode, file, &proto_seq_ops,
2361                             sizeof(struct seq_net_private));
2362 }
2363
2364 static const struct file_operations proto_seq_fops = {
2365         .owner          = THIS_MODULE,
2366         .open           = proto_seq_open,
2367         .read           = seq_read,
2368         .llseek         = seq_lseek,
2369         .release        = seq_release_net,
2370 };
2371
2372 static __net_init int proto_init_net(struct net *net)
2373 {
2374         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2375                 return -ENOMEM;
2376
2377         return 0;
2378 }
2379
2380 static __net_exit void proto_exit_net(struct net *net)
2381 {
2382         proc_net_remove(net, "protocols");
2383 }
2384
2385
2386 static __net_initdata struct pernet_operations proto_net_ops = {
2387         .init = proto_init_net,
2388         .exit = proto_exit_net,
2389 };
2390
2391 static int __init proto_init(void)
2392 {
2393         return register_pernet_subsys(&proto_net_ops);
2394 }
2395
2396 subsys_initcall(proto_init);
2397
2398 #endif /* CONFIG_PROC_FS */