net: Add __sock_queue_rcv_skb()
[pandora-kernel.git] net/core/sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *              Alan Cox        :       Numerous verify_area() problems
17  *              Alan Cox        :       Connecting on a connecting socket
18  *                                      now returns an error for tcp.
19  *              Alan Cox        :       sock->protocol is set correctly,
20  *                                      and is not sometimes left as 0.
21  *              Alan Cox        :       connect handles icmp errors on a
22  *                                      connect properly. Unfortunately there
23  *                                      is a restart syscall nasty there. I
24  *                                      can't match BSD without hacking the C
25  *                                      library. Ideas urgently sought!
26  *              Alan Cox        :       Disallow bind() to addresses that are
27  *                                      not ours - especially broadcast ones!!
28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
30  *                                      instead they leave that for the DESTROY timer.
31  *              Alan Cox        :       Clean up error flag in accept
32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
33  *                                      was buggy. Put a remove_sock() in the handler
34  *                                      for memory when we hit 0. Also altered the timer
35  *                                      code. The ACK stuff can wait and needs major
36  *                                      TCP layer surgery.
37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
38  *                                      and fixed timer/inet_bh race.
39  *              Alan Cox        :       Added zapped flag for TCP
40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
47  *      Pauline Middelink       :       identd support
48  *              Alan Cox        :       Fixed connect() taking signals I think.
49  *              Alan Cox        :       SO_LINGER supported
50  *              Alan Cox        :       Error reporting fixes
51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
52  *              Alan Cox        :       inet sockets don't set sk->type!
53  *              Alan Cox        :       Split socket option code
54  *              Alan Cox        :       Callbacks
55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
56  *              Alex            :       Removed restriction on inet fioctl
57  *              Alan Cox        :       Splitting INET from NET core
58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
60  *              Alan Cox        :       Split IP from generic code
61  *              Alan Cox        :       New kfree_skbmem()
62  *              Alan Cox        :       Make SO_DEBUG superuser only.
63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
64  *                                      (compatibility fix)
65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
66  *              Alan Cox        :       Allocator for a socket is settable.
67  *              Alan Cox        :       SO_ERROR includes soft errors.
68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
69  *              Alan Cox        :       Generic socket allocation to make hooks
70  *                                      easier (suggested by Craig Metz).
71  *              Michael Pall    :       SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
79  *              Andi Kleen      :       Fix write_space callback
80  *              Chris Evans     :       Security fixes - signedness again
81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *              This program is free software; you can redistribute it and/or
87  *              modify it under the terms of the GNU General Public License
88  *              as published by the Free Software Foundation; either version
89  *              2 of the License, or (at your option) any later version.
90  */
91
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114
115 #include <asm/uaccess.h>
116 #include <asm/system.h>
117
118 #include <linux/netdevice.h>
119 #include <net/protocol.h>
120 #include <linux/skbuff.h>
121 #include <net/net_namespace.h>
122 #include <net/request_sock.h>
123 #include <net/sock.h>
124 #include <linux/net_tstamp.h>
125 #include <net/xfrm.h>
126 #include <linux/ipsec.h>
127 #include <net/cls_cgroup.h>
128
129 #include <linux/filter.h>
130
131 #include <trace/events/sock.h>
132
133 #ifdef CONFIG_INET
134 #include <net/tcp.h>
135 #endif
136
137 /*
138  * Each address family might have different locking rules, so we have
139  * one slock key per address family:
140  */
141 static struct lock_class_key af_family_keys[AF_MAX];
142 static struct lock_class_key af_family_slock_keys[AF_MAX];
143
144 /*
145  * Make lock validator output more readable. (we pre-construct these
146  * strings at build time, so that runtime initialization of socket
147  * locks is fast):
148  */
149 static const char *const af_family_key_strings[AF_MAX+1] = {
150   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
151   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
152   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
153   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
154   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
155   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
156   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
157   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
158   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
159   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
160   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
161   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
162   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
163   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
164 };
165 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
166   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
167   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
168   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
169   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
170   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
171   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
172   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
173   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
174   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
175   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
176   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
177   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
178   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
179   "slock-AF_NFC"   , "slock-AF_MAX"
180 };
181 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
182   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
183   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
184   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
185   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
186   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
187   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
188   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
189   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
190   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
191   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
192   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
193   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
194   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
195   "clock-AF_NFC"   , "clock-AF_MAX"
196 };
197
198 /*
199  * sk_callback_lock locking rules are per-address-family,
200  * so split the lock classes by using a per-AF key:
201  */
202 static struct lock_class_key af_callback_keys[AF_MAX];
203
204 /* Take into consideration the size of the struct sk_buff overhead in the
205  * determination of these values, since that is non-constant across
206  * platforms.  This makes socket queueing behavior and performance
207  * not depend upon such differences.
208  */
209 #define _SK_MEM_PACKETS         256
210 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
211 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
212 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
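/* A rough worked example of the sizing above (the figures are illustrative
 * and depend on the architecture and kernel configuration): SKB_TRUESIZE(256)
 * is 256 bytes of payload plus the aligned sizes of struct sk_buff and
 * struct skb_shared_info, i.e. somewhere in the region of 700-900 bytes on a
 * typical 64-bit build.  Multiplied by _SK_MEM_PACKETS (256) that puts the
 * default send and receive buffer limits at roughly 200 KB per socket.
 */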
213
214 /* Run time adjustable parameters. */
215 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
216 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
217 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
218 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
219
220 /* Maximal space eaten by iovec or ancillary data plus some space */
221 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
222 EXPORT_SYMBOL(sysctl_optmem_max);
223
224 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
225 int net_cls_subsys_id = -1;
226 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
227 #endif
228
229 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
230 {
231         struct timeval tv;
232
233         if (optlen < sizeof(tv))
234                 return -EINVAL;
235         if (copy_from_user(&tv, optval, sizeof(tv)))
236                 return -EFAULT;
237         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
238                 return -EDOM;
239
240         if (tv.tv_sec < 0) {
241                 static int warned __read_mostly;
242
243                 *timeo_p = 0;
244                 if (warned < 10 && net_ratelimit()) {
245                         warned++;
246                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
247                                "tries to set negative timeout\n",
248                                 current->comm, task_pid_nr(current));
249                 }
250                 return 0;
251         }
252         *timeo_p = MAX_SCHEDULE_TIMEOUT;
253         if (tv.tv_sec == 0 && tv.tv_usec == 0)
254                 return 0;
255         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
256                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
257         return 0;
258 }
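/* Worked example of the conversion above, assuming HZ = 100 (one jiffy is
 * 10000 usec): a timeout of { .tv_sec = 1, .tv_usec = 5000 } becomes
 * 1 * 100 + (5000 + 9999) / 10000 = 101 jiffies, i.e. any fractional jiffy
 * is rounded up, so a small non-zero timeout never truncates to zero.
 */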
259
260 static void sock_warn_obsolete_bsdism(const char *name)
261 {
262         static int warned;
263         static char warncomm[TASK_COMM_LEN];
264         if (strcmp(warncomm, current->comm) && warned < 5) {
265                 strcpy(warncomm,  current->comm);
266                 printk(KERN_WARNING "process `%s' is using obsolete "
267                        "%s SO_BSDCOMPAT\n", warncomm, name);
268                 warned++;
269         }
270 }
271
272 static void sock_disable_timestamp(struct sock *sk, int flag)
273 {
274         if (sock_flag(sk, flag)) {
275                 sock_reset_flag(sk, flag);
276                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
277                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
278                         net_disable_timestamp();
279                 }
280         }
281 }
282
283
284 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
285 {
286         int skb_len;
287         unsigned long flags;
288         struct sk_buff_head *list = &sk->sk_receive_queue;
289
290         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
291                 atomic_inc(&sk->sk_drops);
292                 trace_sock_rcvqueue_full(sk, skb);
293                 return -ENOMEM;
294         }
295
296         if (!sk_rmem_schedule(sk, skb->truesize)) {
297                 atomic_inc(&sk->sk_drops);
298                 return -ENOBUFS;
299         }
300
301         skb->dev = NULL;
302         skb_set_owner_r(skb, sk);
303
304         /* Cache the SKB length before we tack it onto the receive
305          * queue.  Once it is added it no longer belongs to us and
306          * may be freed by other threads of control pulling packets
307          * from the queue.
308          */
309         skb_len = skb->len;
310
311         /* we escape from the RCU protected region, make sure we don't leak
312          * a non-refcounted dst
313          */
314         skb_dst_force(skb);
315
316         spin_lock_irqsave(&list->lock, flags);
317         skb->dropcount = atomic_read(&sk->sk_drops);
318         __skb_queue_tail(list, skb);
319         spin_unlock_irqrestore(&list->lock, flags);
320
321         if (!sock_flag(sk, SOCK_DEAD))
322                 sk->sk_data_ready(sk, skb_len);
323         return 0;
324 }
325 EXPORT_SYMBOL(__sock_queue_rcv_skb);
326
327 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
328 {
329         int err;
330
331         err = sk_filter(sk, skb);
332         if (err)
333                 return err;
334
335         return __sock_queue_rcv_skb(sk, skb);
336 }
337 EXPORT_SYMBOL(sock_queue_rcv_skb);
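/* A minimal sketch of why the queueing path is split in two: a protocol that
 * has to do work between the socket filter and the actual queueing (for
 * example pulling its own headers) can run sk_filter() itself and then call
 * __sock_queue_rcv_skb() directly instead of sock_queue_rcv_skb().  The
 * helper below is purely hypothetical and only illustrates that pattern.
 */
static inline int example_filter_then_queue(struct sock *sk,
					    struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);

	if (err)
		return err;	/* caller still owns the skb and frees it */

	/* protocol-specific work between filtering and queueing goes here */

	return __sock_queue_rcv_skb(sk, skb); /* -ENOMEM/-ENOBUFS on failure */
}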
338
339 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
340 {
341         int rc = NET_RX_SUCCESS;
342
343         if (sk_filter(sk, skb))
344                 goto discard_and_relse;
345
346         skb->dev = NULL;
347
348         if (sk_rcvqueues_full(sk, skb)) {
349                 atomic_inc(&sk->sk_drops);
350                 goto discard_and_relse;
351         }
352         if (nested)
353                 bh_lock_sock_nested(sk);
354         else
355                 bh_lock_sock(sk);
356         if (!sock_owned_by_user(sk)) {
357                 /*
358                  * trylock + unlock semantics:
359                  */
360                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
361
362                 rc = sk_backlog_rcv(sk, skb);
363
364                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
365         } else if (sk_add_backlog(sk, skb)) {
366                 bh_unlock_sock(sk);
367                 atomic_inc(&sk->sk_drops);
368                 goto discard_and_relse;
369         }
370
371         bh_unlock_sock(sk);
372 out:
373         sock_put(sk);
374         return rc;
375 discard_and_relse:
376         kfree_skb(skb);
377         goto out;
378 }
379 EXPORT_SYMBOL(sk_receive_skb);
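/* Caller sketch for sk_receive_skb(), assuming a hypothetical lookup helper
 * (example_lookup_sock) that returns the socket with a reference held:
 *
 *	sk = example_lookup_sock(net, skb);
 *	if (sk)
 *		return sk_receive_skb(sk, skb, 0);
 *	kfree_skb(skb);
 *	return NET_RX_DROP;
 *
 * sk_receive_skb() always drops that reference with sock_put(); it frees the
 * skb itself when the filter rejects it or the backlog is full, and when the
 * socket is currently owned by a process context it parks the skb on the
 * backlog to be replayed from release_sock() instead of handling it here.
 */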
380
381 void sk_reset_txq(struct sock *sk)
382 {
383         sk_tx_queue_clear(sk);
384 }
385 EXPORT_SYMBOL(sk_reset_txq);
386
387 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
388 {
389         struct dst_entry *dst = __sk_dst_get(sk);
390
391         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
392                 sk_tx_queue_clear(sk);
393                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
394                 dst_release(dst);
395                 return NULL;
396         }
397
398         return dst;
399 }
400 EXPORT_SYMBOL(__sk_dst_check);
401
402 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
403 {
404         struct dst_entry *dst = sk_dst_get(sk);
405
406         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
407                 sk_dst_reset(sk);
408                 dst_release(dst);
409                 return NULL;
410         }
411
412         return dst;
413 }
414 EXPORT_SYMBOL(sk_dst_check);
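/* Usage sketch for the two checks above: an output path would typically
 * revalidate the cached route before each transmit and fall back to a fresh
 * lookup when it has gone stale, roughly
 *
 *	dst = __sk_dst_check(sk, 0);
 *	if (dst == NULL) {
 *		dst = example_route_output(sk);
 *		if (dst != NULL)
 *			sk_setup_caps(sk, dst);
 *	}
 *
 * where example_route_output() stands in for a protocol-specific routing
 * lookup and sk_setup_caps() (defined later in this file) installs the new
 * entry and recomputes the socket's offload capabilities.
 */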
415
416 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
417 {
418         int ret = -ENOPROTOOPT;
419 #ifdef CONFIG_NETDEVICES
420         struct net *net = sock_net(sk);
421         char devname[IFNAMSIZ];
422         int index;
423
424         /* Sorry... */
425         ret = -EPERM;
426         if (!capable(CAP_NET_RAW))
427                 goto out;
428
429         ret = -EINVAL;
430         if (optlen < 0)
431                 goto out;
432
433         /* Bind this socket to a particular device like "eth0",
434          * as specified in the passed interface name. If the
435          * name is "" or the option length is zero the socket
436          * is not bound.
437          */
438         if (optlen > IFNAMSIZ - 1)
439                 optlen = IFNAMSIZ - 1;
440         memset(devname, 0, sizeof(devname));
441
442         ret = -EFAULT;
443         if (copy_from_user(devname, optval, optlen))
444                 goto out;
445
446         index = 0;
447         if (devname[0] != '\0') {
448                 struct net_device *dev;
449
450                 rcu_read_lock();
451                 dev = dev_get_by_name_rcu(net, devname);
452                 if (dev)
453                         index = dev->ifindex;
454                 rcu_read_unlock();
455                 ret = -ENODEV;
456                 if (!dev)
457                         goto out;
458         }
459
460         lock_sock(sk);
461         sk->sk_bound_dev_if = index;
462         sk_dst_reset(sk);
463         release_sock(sk);
464
465         ret = 0;
466
467 out:
468 #endif
469
470         return ret;
471 }
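/* Userspace view of the option handled above, as a sketch: binding requires
 * CAP_NET_RAW and takes the interface name, e.g.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * while an empty name (or a zero option length) removes the binding again;
 * "eth0" is only an example interface name.
 */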
472
473 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
474 {
475         if (valbool)
476                 sock_set_flag(sk, bit);
477         else
478                 sock_reset_flag(sk, bit);
479 }
480
481 /*
482  *      This is meant for all protocols to use and covers goings on
483  *      at the socket level. Everything here is generic.
484  */
485
486 int sock_setsockopt(struct socket *sock, int level, int optname,
487                     char __user *optval, unsigned int optlen)
488 {
489         struct sock *sk = sock->sk;
490         int val;
491         int valbool;
492         struct linger ling;
493         int ret = 0;
494
495         /*
496          *      Options without arguments
497          */
498
499         if (optname == SO_BINDTODEVICE)
500                 return sock_bindtodevice(sk, optval, optlen);
501
502         if (optlen < sizeof(int))
503                 return -EINVAL;
504
505         if (get_user(val, (int __user *)optval))
506                 return -EFAULT;
507
508         valbool = val ? 1 : 0;
509
510         lock_sock(sk);
511
512         switch (optname) {
513         case SO_DEBUG:
514                 if (val && !capable(CAP_NET_ADMIN))
515                         ret = -EACCES;
516                 else
517                         sock_valbool_flag(sk, SOCK_DBG, valbool);
518                 break;
519         case SO_REUSEADDR:
520                 sk->sk_reuse = valbool;
521                 break;
522         case SO_TYPE:
523         case SO_PROTOCOL:
524         case SO_DOMAIN:
525         case SO_ERROR:
526                 ret = -ENOPROTOOPT;
527                 break;
528         case SO_DONTROUTE:
529                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
530                 break;
531         case SO_BROADCAST:
532                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
533                 break;
534         case SO_SNDBUF:
535                 /* Don't error on this; BSD doesn't, and if you think
536                  * about it this is right. Otherwise apps have to
537                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
538                  * are treated in BSD as hints.
539                  */
540                 val = min_t(u32, val, sysctl_wmem_max);
541 set_sndbuf:
542                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
543                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
544                 /* Wake up sending tasks if we upped the value. */
545                 sk->sk_write_space(sk);
546                 break;
547
548         case SO_SNDBUFFORCE:
549                 if (!capable(CAP_NET_ADMIN)) {
550                         ret = -EPERM;
551                         break;
552                 }
553                 goto set_sndbuf;
554
555         case SO_RCVBUF:
556                 /* Don't error on this; BSD doesn't, and if you think
557                  * about it this is right. Otherwise apps have to
558                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
559                  * are treated in BSD as hints.
560                  */
561                 val = min_t(u32, val, sysctl_rmem_max);
562 set_rcvbuf:
563                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
564                 /*
565                  * We double it on the way in to account for
566                  * "struct sk_buff" etc. overhead.   Applications
567                  * assume that the SO_RCVBUF setting they make will
568                  * allow that much actual data to be received on that
569                  * socket.
570                  *
571                  * Applications are unaware that "struct sk_buff" and
572                  * other overheads allocate from the receive buffer
573                  * during socket buffer allocation.
574                  *
575                  * And after considering the possible alternatives,
576                  * returning the value we actually used in getsockopt
577                  * is the most desirable behavior.
578                  */
579                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
580                 break;
581
582         case SO_RCVBUFFORCE:
583                 if (!capable(CAP_NET_ADMIN)) {
584                         ret = -EPERM;
585                         break;
586                 }
587                 goto set_rcvbuf;
588
589         case SO_KEEPALIVE:
590 #ifdef CONFIG_INET
591                 if (sk->sk_protocol == IPPROTO_TCP &&
592                     sk->sk_type == SOCK_STREAM)
593                         tcp_set_keepalive(sk, valbool);
594 #endif
595                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
596                 break;
597
598         case SO_OOBINLINE:
599                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
600                 break;
601
602         case SO_NO_CHECK:
603                 sk->sk_no_check = valbool;
604                 break;
605
606         case SO_PRIORITY:
607                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
608                         sk->sk_priority = val;
609                 else
610                         ret = -EPERM;
611                 break;
612
613         case SO_LINGER:
614                 if (optlen < sizeof(ling)) {
615                         ret = -EINVAL;  /* 1003.1g */
616                         break;
617                 }
618                 if (copy_from_user(&ling, optval, sizeof(ling))) {
619                         ret = -EFAULT;
620                         break;
621                 }
622                 if (!ling.l_onoff)
623                         sock_reset_flag(sk, SOCK_LINGER);
624                 else {
625 #if (BITS_PER_LONG == 32)
626                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
627                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
628                         else
629 #endif
630                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
631                         sock_set_flag(sk, SOCK_LINGER);
632                 }
633                 break;
634
635         case SO_BSDCOMPAT:
636                 sock_warn_obsolete_bsdism("setsockopt");
637                 break;
638
639         case SO_PASSCRED:
640                 if (valbool)
641                         set_bit(SOCK_PASSCRED, &sock->flags);
642                 else
643                         clear_bit(SOCK_PASSCRED, &sock->flags);
644                 break;
645
646         case SO_TIMESTAMP:
647         case SO_TIMESTAMPNS:
648                 if (valbool)  {
649                         if (optname == SO_TIMESTAMP)
650                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
651                         else
652                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
653                         sock_set_flag(sk, SOCK_RCVTSTAMP);
654                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
655                 } else {
656                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
657                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
658                 }
659                 break;
660
661         case SO_TIMESTAMPING:
662                 if (val & ~SOF_TIMESTAMPING_MASK) {
663                         ret = -EINVAL;
664                         break;
665                 }
666                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
667                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
668                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
669                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
670                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
671                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
672                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
673                         sock_enable_timestamp(sk,
674                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
675                 else
676                         sock_disable_timestamp(sk,
677                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
678                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
679                                   val & SOF_TIMESTAMPING_SOFTWARE);
680                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
681                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
682                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
683                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
684                 break;
685
686         case SO_RCVLOWAT:
687                 if (val < 0)
688                         val = INT_MAX;
689                 sk->sk_rcvlowat = val ? : 1;
690                 break;
691
692         case SO_RCVTIMEO:
693                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
694                 break;
695
696         case SO_SNDTIMEO:
697                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
698                 break;
699
700         case SO_ATTACH_FILTER:
701                 ret = -EINVAL;
702                 if (optlen == sizeof(struct sock_fprog)) {
703                         struct sock_fprog fprog;
704
705                         ret = -EFAULT;
706                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
707                                 break;
708
709                         ret = sk_attach_filter(&fprog, sk);
710                 }
711                 break;
712
713         case SO_DETACH_FILTER:
714                 ret = sk_detach_filter(sk);
715                 break;
716
717         case SO_PASSSEC:
718                 if (valbool)
719                         set_bit(SOCK_PASSSEC, &sock->flags);
720                 else
721                         clear_bit(SOCK_PASSSEC, &sock->flags);
722                 break;
723         case SO_MARK:
724                 if (!capable(CAP_NET_ADMIN))
725                         ret = -EPERM;
726                 else
727                         sk->sk_mark = val;
728                 break;
729
730                 /* We implement SO_SNDLOWAT etc. to
731                    not be settable (1003.1g 5.3) */
732         case SO_RXQ_OVFL:
733                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
734                 break;
735         default:
736                 ret = -ENOPROTOOPT;
737                 break;
738         }
739         release_sock(sk);
740         return ret;
741 }
742 EXPORT_SYMBOL(sock_setsockopt);
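/* Worked example of the SO_RCVBUF/SO_SNDBUF doubling handled above: if an
 * application asks for a 64 KB receive buffer,
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * the request is first clamped to sysctl_rmem_max and then doubled, so with
 * a large enough rmem_max a later getsockopt(SO_RCVBUF) reports 131072; the
 * extra headroom pays for struct sk_buff and related per-packet overhead.
 */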
743
744
745 void cred_to_ucred(struct pid *pid, const struct cred *cred,
746                    struct ucred *ucred)
747 {
748         ucred->pid = pid_vnr(pid);
749         ucred->uid = ucred->gid = -1;
750         if (cred) {
751                 struct user_namespace *current_ns = current_user_ns();
752
753                 ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
754                 ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
755         }
756 }
757 EXPORT_SYMBOL_GPL(cred_to_ucred);
758
759 void cred_real_to_ucred(struct pid *pid, const struct cred *cred,
760                         struct ucred *ucred)
761 {
762         ucred->pid = pid_vnr(pid);
763         ucred->uid = ucred->gid = -1;
764         if (cred) {
765                 struct user_namespace *current_ns = current_user_ns();
766
767                 ucred->uid = user_ns_map_uid(current_ns, cred, cred->uid);
768                 ucred->gid = user_ns_map_gid(current_ns, cred, cred->gid);
769         }
770 }
771 EXPORT_SYMBOL_GPL(cred_real_to_ucred);
772
773 int sock_getsockopt(struct socket *sock, int level, int optname,
774                     char __user *optval, int __user *optlen)
775 {
776         struct sock *sk = sock->sk;
777
778         union {
779                 int val;
780                 struct linger ling;
781                 struct timeval tm;
782         } v;
783
784         int lv = sizeof(int);
785         int len;
786
787         if (get_user(len, optlen))
788                 return -EFAULT;
789         if (len < 0)
790                 return -EINVAL;
791
792         memset(&v, 0, sizeof(v));
793
794         switch (optname) {
795         case SO_DEBUG:
796                 v.val = sock_flag(sk, SOCK_DBG);
797                 break;
798
799         case SO_DONTROUTE:
800                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
801                 break;
802
803         case SO_BROADCAST:
804                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
805                 break;
806
807         case SO_SNDBUF:
808                 v.val = sk->sk_sndbuf;
809                 break;
810
811         case SO_RCVBUF:
812                 v.val = sk->sk_rcvbuf;
813                 break;
814
815         case SO_REUSEADDR:
816                 v.val = sk->sk_reuse;
817                 break;
818
819         case SO_KEEPALIVE:
820                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
821                 break;
822
823         case SO_TYPE:
824                 v.val = sk->sk_type;
825                 break;
826
827         case SO_PROTOCOL:
828                 v.val = sk->sk_protocol;
829                 break;
830
831         case SO_DOMAIN:
832                 v.val = sk->sk_family;
833                 break;
834
835         case SO_ERROR:
836                 v.val = -sock_error(sk);
837                 if (v.val == 0)
838                         v.val = xchg(&sk->sk_err_soft, 0);
839                 break;
840
841         case SO_OOBINLINE:
842                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
843                 break;
844
845         case SO_NO_CHECK:
846                 v.val = sk->sk_no_check;
847                 break;
848
849         case SO_PRIORITY:
850                 v.val = sk->sk_priority;
851                 break;
852
853         case SO_LINGER:
854                 lv              = sizeof(v.ling);
855                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
856                 v.ling.l_linger = sk->sk_lingertime / HZ;
857                 break;
858
859         case SO_BSDCOMPAT:
860                 sock_warn_obsolete_bsdism("getsockopt");
861                 break;
862
863         case SO_TIMESTAMP:
864                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
865                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
866                 break;
867
868         case SO_TIMESTAMPNS:
869                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
870                 break;
871
872         case SO_TIMESTAMPING:
873                 v.val = 0;
874                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
875                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
876                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
877                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
878                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
879                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
880                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
881                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
882                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
883                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
884                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
885                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
886                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
887                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
888                 break;
889
890         case SO_RCVTIMEO:
891                 lv = sizeof(struct timeval);
892                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
893                         v.tm.tv_sec = 0;
894                         v.tm.tv_usec = 0;
895                 } else {
896                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
897                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
898                 }
899                 break;
900
901         case SO_SNDTIMEO:
902                 lv = sizeof(struct timeval);
903                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
904                         v.tm.tv_sec = 0;
905                         v.tm.tv_usec = 0;
906                 } else {
907                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
908                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
909                 }
910                 break;
911
912         case SO_RCVLOWAT:
913                 v.val = sk->sk_rcvlowat;
914                 break;
915
916         case SO_SNDLOWAT:
917                 v.val = 1;
918                 break;
919
920         case SO_PASSCRED:
921                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
922                 break;
923
924         case SO_PEERCRED:
925         {
926                 struct ucred peercred;
927                 if (len > sizeof(peercred))
928                         len = sizeof(peercred);
929                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
930                 if (copy_to_user(optval, &peercred, len))
931                         return -EFAULT;
932                 goto lenout;
933         }
934
935         case SO_PEERNAME:
936         {
937                 char address[128];
938
939                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
940                         return -ENOTCONN;
941                 if (lv < len)
942                         return -EINVAL;
943                 if (copy_to_user(optval, address, len))
944                         return -EFAULT;
945                 goto lenout;
946         }
947
948         /* Dubious BSD thing... Probably nobody even uses it, but
949          * the UNIX standard wants it for whatever reason... -DaveM
950          */
951         case SO_ACCEPTCONN:
952                 v.val = sk->sk_state == TCP_LISTEN;
953                 break;
954
955         case SO_PASSSEC:
956                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
957                 break;
958
959         case SO_PEERSEC:
960                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
961
962         case SO_MARK:
963                 v.val = sk->sk_mark;
964                 break;
965
966         case SO_RXQ_OVFL:
967                 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
968                 break;
969
970         default:
971                 return -ENOPROTOOPT;
972         }
973
974         if (len > lv)
975                 len = lv;
976         if (copy_to_user(optval, &v, len))
977                 return -EFAULT;
978 lenout:
979         if (put_user(len, optlen))
980                 return -EFAULT;
981         return 0;
982 }
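/* Usage sketch for SO_ERROR as handled above: the pending socket error is
 * returned as a positive errno value and cleared in the process, which is
 * the usual way to collect the result of a non-blocking connect(), e.g.
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * where err ends up 0 on success or something like ECONNREFUSED on failure.
 */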
983
984 /*
985  * Initialize an sk_lock.
986  *
987  * (We also register the sk_lock with the lock validator.)
988  */
989 static inline void sock_lock_init(struct sock *sk)
990 {
991         sock_lock_init_class_and_name(sk,
992                         af_family_slock_key_strings[sk->sk_family],
993                         af_family_slock_keys + sk->sk_family,
994                         af_family_key_strings[sk->sk_family],
995                         af_family_keys + sk->sk_family);
996 }
997
998 /*
999  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1000  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1001  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1002  */
1003 static void sock_copy(struct sock *nsk, const struct sock *osk)
1004 {
1005 #ifdef CONFIG_SECURITY_NETWORK
1006         void *sptr = nsk->sk_security;
1007 #endif
1008         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1009
1010         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1011                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1012
1013 #ifdef CONFIG_SECURITY_NETWORK
1014         nsk->sk_security = sptr;
1015         security_sk_clone(osk, nsk);
1016 #endif
1017 }
1018
1019 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1020 {
1021         unsigned long nulls1, nulls2;
1022
1023         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1024         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1025         if (nulls1 > nulls2)
1026                 swap(nulls1, nulls2);
1027
1028         if (nulls1 != 0)
1029                 memset((char *)sk, 0, nulls1);
1030         memset((char *)sk + nulls1 + sizeof(void *), 0,
1031                nulls2 - nulls1 - sizeof(void *));
1032         memset((char *)sk + nulls2 + sizeof(void *), 0,
1033                size - nulls2 - sizeof(void *));
1034 }
1035 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1036
1037 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1038                 int family)
1039 {
1040         struct sock *sk;
1041         struct kmem_cache *slab;
1042
1043         slab = prot->slab;
1044         if (slab != NULL) {
1045                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1046                 if (!sk)
1047                         return sk;
1048                 if (priority & __GFP_ZERO) {
1049                         if (prot->clear_sk)
1050                                 prot->clear_sk(sk, prot->obj_size);
1051                         else
1052                                 sk_prot_clear_nulls(sk, prot->obj_size);
1053                 }
1054         } else
1055                 sk = kmalloc(prot->obj_size, priority);
1056
1057         if (sk != NULL) {
1058                 kmemcheck_annotate_bitfield(sk, flags);
1059
1060                 if (security_sk_alloc(sk, family, priority))
1061                         goto out_free;
1062
1063                 if (!try_module_get(prot->owner))
1064                         goto out_free_sec;
1065                 sk_tx_queue_clear(sk);
1066         }
1067
1068         return sk;
1069
1070 out_free_sec:
1071         security_sk_free(sk);
1072 out_free:
1073         if (slab != NULL)
1074                 kmem_cache_free(slab, sk);
1075         else
1076                 kfree(sk);
1077         return NULL;
1078 }
1079
1080 static void sk_prot_free(struct proto *prot, struct sock *sk)
1081 {
1082         struct kmem_cache *slab;
1083         struct module *owner;
1084
1085         owner = prot->owner;
1086         slab = prot->slab;
1087
1088         security_sk_free(sk);
1089         if (slab != NULL)
1090                 kmem_cache_free(slab, sk);
1091         else
1092                 kfree(sk);
1093         module_put(owner);
1094 }
1095
1096 #ifdef CONFIG_CGROUPS
1097 void sock_update_classid(struct sock *sk)
1098 {
1099         u32 classid;
1100
1101         rcu_read_lock();  /* doing current task, which cannot vanish. */
1102         classid = task_cls_classid(current);
1103         rcu_read_unlock();
1104         if (classid && classid != sk->sk_classid)
1105                 sk->sk_classid = classid;
1106 }
1107 EXPORT_SYMBOL(sock_update_classid);
1108 #endif
1109
1110 /**
1111  *      sk_alloc - All socket objects are allocated here
1112  *      @net: the applicable net namespace
1113  *      @family: protocol family
1114  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1115  *      @prot: struct proto associated with this new sock instance
1116  */
1117 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1118                       struct proto *prot)
1119 {
1120         struct sock *sk;
1121
1122         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1123         if (sk) {
1124                 sk->sk_family = family;
1125                 /*
1126                  * See comment in struct sock definition to understand
1127                  * why we need sk_prot_creator -acme
1128                  */
1129                 sk->sk_prot = sk->sk_prot_creator = prot;
1130                 sock_lock_init(sk);
1131                 sock_net_set(sk, get_net(net));
1132                 atomic_set(&sk->sk_wmem_alloc, 1);
1133
1134                 sock_update_classid(sk);
1135         }
1136
1137         return sk;
1138 }
1139 EXPORT_SYMBOL(sk_alloc);
1140
1141 static void __sk_free(struct sock *sk)
1142 {
1143         struct sk_filter *filter;
1144
1145         if (sk->sk_destruct)
1146                 sk->sk_destruct(sk);
1147
1148         filter = rcu_dereference_check(sk->sk_filter,
1149                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1150         if (filter) {
1151                 sk_filter_uncharge(sk, filter);
1152                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1153         }
1154
1155         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1156         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1157
1158         if (atomic_read(&sk->sk_omem_alloc))
1159                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1160                        __func__, atomic_read(&sk->sk_omem_alloc));
1161
1162         if (sk->sk_peer_cred)
1163                 put_cred(sk->sk_peer_cred);
1164         put_pid(sk->sk_peer_pid);
1165         put_net(sock_net(sk));
1166         sk_prot_free(sk->sk_prot_creator, sk);
1167 }
1168
1169 void sk_free(struct sock *sk)
1170 {
1171         /*
1172          * We subtract one from sk_wmem_alloc and can tell if
1173          * some packets are still in some tx queue.
1174          * If it is not zero, sock_wfree() will call __sk_free(sk) later
1175          */
1176         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1177                 __sk_free(sk);
1178 }
1179 EXPORT_SYMBOL(sk_free);
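/* Worked example of the sk_wmem_alloc lifetime trick above: sk_alloc()
 * starts sk_wmem_alloc at 1, and every queued tx skb adds its truesize via
 * skb_set_owner_w().  Suppose a packet with a truesize of 512 bytes is still
 * in flight when the last reference goes away and sk_free() runs: the
 * counter drops from 513 to 512, so __sk_free() is deferred.  Only when the
 * skb is finally freed does sock_wfree() remove the remaining charge, the
 * counter reaches zero and __sk_free() tears the socket down.
 */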
1180
1181 /*
1182  * The last sock_put should drop the reference to sk->sk_net. It has
1183  * already been dropped in sk_change_net. Taking a reference to a namespace
1184  * that is being stopped is not an option.
1185  * Take a reference to the socket to remove it from the hash while still
1186  * _alive_, and after that destroy it in the context of init_net.
1187  */
1188 void sk_release_kernel(struct sock *sk)
1189 {
1190         if (sk == NULL || sk->sk_socket == NULL)
1191                 return;
1192
1193         sock_hold(sk);
1194         sock_release(sk->sk_socket);
1195         release_net(sock_net(sk));
1196         sock_net_set(sk, get_net(&init_net));
1197         sock_put(sk);
1198 }
1199 EXPORT_SYMBOL(sk_release_kernel);
1200
1201 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1202 {
1203         struct sock *newsk;
1204
1205         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1206         if (newsk != NULL) {
1207                 struct sk_filter *filter;
1208
1209                 sock_copy(newsk, sk);
1210
1211                 /* SANITY */
1212                 get_net(sock_net(newsk));
1213                 sk_node_init(&newsk->sk_node);
1214                 sock_lock_init(newsk);
1215                 bh_lock_sock(newsk);
1216                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1217                 newsk->sk_backlog.len = 0;
1218
1219                 atomic_set(&newsk->sk_rmem_alloc, 0);
1220                 /*
1221                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1222                  */
1223                 atomic_set(&newsk->sk_wmem_alloc, 1);
1224                 atomic_set(&newsk->sk_omem_alloc, 0);
1225                 skb_queue_head_init(&newsk->sk_receive_queue);
1226                 skb_queue_head_init(&newsk->sk_write_queue);
1227 #ifdef CONFIG_NET_DMA
1228                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1229 #endif
1230
1231                 spin_lock_init(&newsk->sk_dst_lock);
1232                 rwlock_init(&newsk->sk_callback_lock);
1233                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1234                                 af_callback_keys + newsk->sk_family,
1235                                 af_family_clock_key_strings[newsk->sk_family]);
1236
1237                 newsk->sk_dst_cache     = NULL;
1238                 newsk->sk_wmem_queued   = 0;
1239                 newsk->sk_forward_alloc = 0;
1240                 newsk->sk_send_head     = NULL;
1241                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1242
1243                 sock_reset_flag(newsk, SOCK_DONE);
1244                 skb_queue_head_init(&newsk->sk_error_queue);
1245
1246                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1247                 if (filter != NULL)
1248                         sk_filter_charge(newsk, filter);
1249
1250                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1251                         /* It is still a raw copy of the parent, so invalidate
1252                          * the destructor and do a plain sk_free() */
1253                         newsk->sk_destruct = NULL;
1254                         bh_unlock_sock(newsk);
1255                         sk_free(newsk);
1256                         newsk = NULL;
1257                         goto out;
1258                 }
1259
1260                 newsk->sk_err      = 0;
1261                 newsk->sk_priority = 0;
1262                 /*
1263                  * Before updating sk_refcnt, we must commit prior changes to memory
1264                  * (Documentation/RCU/rculist_nulls.txt for details)
1265                  */
1266                 smp_wmb();
1267                 atomic_set(&newsk->sk_refcnt, 2);
1268
1269                 /*
1270                  * Increment the counter in the same struct proto as the master
1271                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1272                  * is the same as sk->sk_prot->socks, as this field was copied
1273                  * with memcpy).
1274                  *
1275                  * This _changes_ the previous behaviour, where
1276  * tcp_create_openreq_child was always incrementing the
1277  * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1278  * to be taken into account in all callers. -acme
1279                  */
1280                 sk_refcnt_debug_inc(newsk);
1281                 sk_set_socket(newsk, NULL);
1282                 newsk->sk_wq = NULL;
1283
1284                 if (newsk->sk_prot->sockets_allocated)
1285                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1286
1287                 if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1288                     sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1289                         net_enable_timestamp();
1290         }
1291 out:
1292         return newsk;
1293 }
1294 EXPORT_SYMBOL_GPL(sk_clone);
1295
1296 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1297 {
1298         __sk_dst_set(sk, dst);
1299         sk->sk_route_caps = dst->dev->features;
1300         if (sk->sk_route_caps & NETIF_F_GSO)
1301                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1302         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1303         if (sk_can_gso(sk)) {
1304                 if (dst->header_len) {
1305                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1306                 } else {
1307                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1308                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1309                 }
1310         }
1311 }
1312 EXPORT_SYMBOL_GPL(sk_setup_caps);
1313
1314 void __init sk_init(void)
1315 {
1316         if (totalram_pages <= 4096) {
1317                 sysctl_wmem_max = 32767;
1318                 sysctl_rmem_max = 32767;
1319                 sysctl_wmem_default = 32767;
1320                 sysctl_rmem_default = 32767;
1321         } else if (totalram_pages >= 131072) {
1322                 sysctl_wmem_max = 131071;
1323                 sysctl_rmem_max = 131071;
1324         }
1325 }
1326
1327 /*
1328  *      Simple resource managers for sockets.
1329  */
1330
1331
1332 /*
1333  * Write buffer destructor automatically called from kfree_skb.
1334  */
1335 void sock_wfree(struct sk_buff *skb)
1336 {
1337         struct sock *sk = skb->sk;
1338         unsigned int len = skb->truesize;
1339
1340         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1341                 /*
1342                  * Keep a reference on sk_wmem_alloc; it will be released
1343                  * after the sk_write_space() call
1344                  */
1345                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1346                 sk->sk_write_space(sk);
1347                 len = 1;
1348         }
1349         /*
1350          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1351          * could not do because of in-flight packets
1352          */
1353         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1354                 __sk_free(sk);
1355 }
1356 EXPORT_SYMBOL(sock_wfree);
1357
1358 /*
1359  * Read buffer destructor automatically called from kfree_skb.
1360  */
1361 void sock_rfree(struct sk_buff *skb)
1362 {
1363         struct sock *sk = skb->sk;
1364         unsigned int len = skb->truesize;
1365
1366         atomic_sub(len, &sk->sk_rmem_alloc);
1367         sk_mem_uncharge(sk, len);
1368 }
1369 EXPORT_SYMBOL(sock_rfree);
1370
1371
1372 int sock_i_uid(struct sock *sk)
1373 {
1374         int uid;
1375
1376         read_lock_bh(&sk->sk_callback_lock);
1377         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1378         read_unlock_bh(&sk->sk_callback_lock);
1379         return uid;
1380 }
1381 EXPORT_SYMBOL(sock_i_uid);
1382
1383 unsigned long sock_i_ino(struct sock *sk)
1384 {
1385         unsigned long ino;
1386
1387         read_lock_bh(&sk->sk_callback_lock);
1388         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1389         read_unlock_bh(&sk->sk_callback_lock);
1390         return ino;
1391 }
1392 EXPORT_SYMBOL(sock_i_ino);
1393
1394 /*
1395  * Allocate a skb from the socket's send buffer.
1396  */
1397 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1398                              gfp_t priority)
1399 {
1400         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1401                 struct sk_buff *skb = alloc_skb(size, priority);
1402                 if (skb) {
1403                         skb_set_owner_w(skb, sk);
1404                         return skb;
1405                 }
1406         }
1407         return NULL;
1408 }
1409 EXPORT_SYMBOL(sock_wmalloc);
1410
1411 /*
1412  * Allocate a skb from the socket's receive buffer.
1413  */
1414 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1415                              gfp_t priority)
1416 {
1417         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1418                 struct sk_buff *skb = alloc_skb(size, priority);
1419                 if (skb) {
1420                         skb_set_owner_r(skb, sk);
1421                         return skb;
1422                 }
1423         }
1424         return NULL;
1425 }
1426
1427 /*
1428  * Allocate a memory block from the socket's option memory buffer.
1429  */
1430 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1431 {
1432         if ((unsigned)size <= sysctl_optmem_max &&
1433             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1434                 void *mem;
1435                 /* First do the add, to avoid the race if kmalloc
1436                  * might sleep.
1437                  */
1438                 atomic_add(size, &sk->sk_omem_alloc);
1439                 mem = kmalloc(size, priority);
1440                 if (mem)
1441                         return mem;
1442                 atomic_sub(size, &sk->sk_omem_alloc);
1443         }
1444         return NULL;
1445 }
1446 EXPORT_SYMBOL(sock_kmalloc);
1447
1448 /*
1449  * Free an option memory block.
1450  */
1451 void sock_kfree_s(struct sock *sk, void *mem, int size)
1452 {
1453         kfree(mem);
1454         atomic_sub(size, &sk->sk_omem_alloc);
1455 }
1456 EXPORT_SYMBOL(sock_kfree_s);
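/* Pairing sketch for the two helpers above: option memory is charged to
 * sk_omem_alloc when allocated and must be released with the same size, e.g.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (buf == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 *
 * so that the per-socket accounting balances back to zero.
 */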
1457
1458 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1459    I think these locks should be removed for datagram sockets.
1460  */
1461 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1462 {
1463         DEFINE_WAIT(wait);
1464
1465         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1466         for (;;) {
1467                 if (!timeo)
1468                         break;
1469                 if (signal_pending(current))
1470                         break;
1471                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1472                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1473                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1474                         break;
1475                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1476                         break;
1477                 if (sk->sk_err)
1478                         break;
1479                 timeo = schedule_timeout(timeo);
1480         }
1481         finish_wait(sk_sleep(sk), &wait);
1482         return timeo;
1483 }
1484
1485
1486 /*
1487  *      Generic send/receive buffer handlers
1488  */
1489
1490 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1491                                      unsigned long data_len, int noblock,
1492                                      int *errcode)
1493 {
1494         struct sk_buff *skb;
1495         gfp_t gfp_mask;
1496         long timeo;
1497         int err;
1498         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1499
1500         err = -EMSGSIZE;
1501         if (npages > MAX_SKB_FRAGS)
1502                 goto failure;
1503
1504         gfp_mask = sk->sk_allocation;
1505         if (gfp_mask & __GFP_WAIT)
1506                 gfp_mask |= __GFP_REPEAT;
1507
1508         timeo = sock_sndtimeo(sk, noblock);
1509         while (1) {
1510                 err = sock_error(sk);
1511                 if (err != 0)
1512                         goto failure;
1513
1514                 err = -EPIPE;
1515                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1516                         goto failure;
1517
1518                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1519                         skb = alloc_skb(header_len, gfp_mask);
1520                         if (skb) {
1521                                 int i;
1522
1523                                 /* No pages, we're done... */
1524                                 if (!data_len)
1525                                         break;
1526
1527                                 skb->truesize += data_len;
1528                                 skb_shinfo(skb)->nr_frags = npages;
1529                                 for (i = 0; i < npages; i++) {
1530                                         struct page *page;
1531
1532                                         page = alloc_pages(sk->sk_allocation, 0);
1533                                         if (!page) {
1534                                                 err = -ENOBUFS;
1535                                                 skb_shinfo(skb)->nr_frags = i;
1536                                                 kfree_skb(skb);
1537                                                 goto failure;
1538                                         }
1539
1540                                         __skb_fill_page_desc(skb, i,
1541                                                         page, 0,
1542                                                         (data_len >= PAGE_SIZE ?
1543                                                          PAGE_SIZE :
1544                                                          data_len));
1545                                         data_len -= PAGE_SIZE;
1546                                 }
1547
1548                                 /* Full success... */
1549                                 break;
1550                         }
1551                         err = -ENOBUFS;
1552                         goto failure;
1553                 }
1554                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1555                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1556                 err = -EAGAIN;
1557                 if (!timeo)
1558                         goto failure;
1559                 if (signal_pending(current))
1560                         goto interrupted;
1561                 timeo = sock_wait_for_wmem(sk, timeo);
1562         }
1563
1564         skb_set_owner_w(skb, sk);
1565         return skb;
1566
1567 interrupted:
1568         err = sock_intr_errno(timeo);
1569 failure:
1570         *errcode = err;
1571         return NULL;
1572 }
1573 EXPORT_SYMBOL(sock_alloc_send_pskb);
1574
1575 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1576                                     int noblock, int *errcode)
1577 {
1578         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1579 }
1580 EXPORT_SYMBOL(sock_alloc_send_skb);
1581
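/*
 * Sendmsg-side sketch (sk, len, hlen, msg and the out_err label are assumed
 * to exist in the caller): sock_alloc_send_skb() sleeps, subject to the
 * socket's send timeout, until sk_wmem_alloc drops below sk_sndbuf, and the
 * returned skb is already charged to the socket via skb_set_owner_w().
 */
#if 0
        struct sk_buff *skb;
        int err;

        skb = sock_alloc_send_skb(sk, hlen + len,
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                goto out_err;           /* err is -EAGAIN, -EPIPE, -EINTR, ... */
        skb_reserve(skb, hlen);
        /* ... build the packet and hand it to the output path ... */
#endif
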
1582 static void __lock_sock(struct sock *sk)
1583         __releases(&sk->sk_lock.slock)
1584         __acquires(&sk->sk_lock.slock)
1585 {
1586         DEFINE_WAIT(wait);
1587
1588         for (;;) {
1589                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1590                                         TASK_UNINTERRUPTIBLE);
1591                 spin_unlock_bh(&sk->sk_lock.slock);
1592                 schedule();
1593                 spin_lock_bh(&sk->sk_lock.slock);
1594                 if (!sock_owned_by_user(sk))
1595                         break;
1596         }
1597         finish_wait(&sk->sk_lock.wq, &wait);
1598 }
1599
1600 static void __release_sock(struct sock *sk)
1601         __releases(&sk->sk_lock.slock)
1602         __acquires(&sk->sk_lock.slock)
1603 {
1604         struct sk_buff *skb = sk->sk_backlog.head;
1605
1606         do {
1607                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1608                 bh_unlock_sock(sk);
1609
1610                 do {
1611                         struct sk_buff *next = skb->next;
1612
1613                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1614                         skb->next = NULL;
1615                         sk_backlog_rcv(sk, skb);
1616
1617                         /*
1618                          * We are in process context here with softirqs
1619                          * disabled, use cond_resched_softirq() to preempt.
1620                          * This is safe to do because we've taken the backlog
1621                          * queue private:
1622                          */
1623                         cond_resched_softirq();
1624
1625                         skb = next;
1626                 } while (skb != NULL);
1627
1628                 bh_lock_sock(sk);
1629         } while ((skb = sk->sk_backlog.head) != NULL);
1630
1631         /*
1632          * Doing the zeroing here guarantees we cannot loop forever
1633          * while a wild producer attempts to flood us.
1634          */
1635         sk->sk_backlog.len = 0;
1636 }
1637
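/*
 * The backlog drained above is filled from softirq context while the socket
 * is owned by a process; the receive path typically looks roughly like the
 * sketch below (sk, skb, rc and the drop label are the caller's).
 */
#if 0
        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk_backlog_rcv(sk, skb);           /* process directly */
        else if (sk_add_backlog(sk, skb)) {
                bh_unlock_sock(sk);
                goto drop;                              /* backlog full */
        }
        bh_unlock_sock(sk);
#endif
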
1638 /**
1639  * sk_wait_data - wait for data to arrive at sk_receive_queue
1640  * @sk:    sock to wait on
1641  * @timeo: for how long
1642  *
1643  * Socket state, including sk->sk_err, is now changed only under the socket
1644  * lock, hence we may omit checks after joining the wait queue.
1645  * We check the receive queue before schedule() only as an optimization;
1646  * it is very likely that release_sock() added new data.
1647  */
1648 int sk_wait_data(struct sock *sk, long *timeo)
1649 {
1650         int rc;
1651         DEFINE_WAIT(wait);
1652
1653         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1654         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1655         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1656         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1657         finish_wait(sk_sleep(sk), &wait);
1658         return rc;
1659 }
1660 EXPORT_SYMBOL(sk_wait_data);
1661
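/*
 * Typical caller pattern (sk and flags come from the caller's recvmsg): the
 * socket lock is held on entry, so the release_sock()/lock_sock() pair inside
 * sk_wait_event() lets the backlog be processed while we sleep.
 */
#if 0
        long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        lock_sock(sk);
        while (skb_queue_empty(&sk->sk_receive_queue)) {
                if (!timeo || signal_pending(current))
                        break;
                sk_wait_data(sk, &timeo);
        }
        /* ... dequeue and copy to userspace ... */
        release_sock(sk);
#endif
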
1662 /**
1663  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1664  *      @sk: socket
1665  *      @size: memory size to allocate
1666  *      @kind: allocation type
1667  *
1668  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1669  *      rmem allocation. This function assumes that protocols which have
1670  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1671  */
1672 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1673 {
1674         struct proto *prot = sk->sk_prot;
1675         int amt = sk_mem_pages(size);
1676         long allocated;
1677
1678         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1679         allocated = atomic_long_add_return(amt, prot->memory_allocated);
1680
1681         /* Under limit. */
1682         if (allocated <= prot->sysctl_mem[0]) {
1683                 if (prot->memory_pressure && *prot->memory_pressure)
1684                         *prot->memory_pressure = 0;
1685                 return 1;
1686         }
1687
1688         /* Under pressure. */
1689         if (allocated > prot->sysctl_mem[1])
1690                 if (prot->enter_memory_pressure)
1691                         prot->enter_memory_pressure(sk);
1692
1693         /* Over hard limit. */
1694         if (allocated > prot->sysctl_mem[2])
1695                 goto suppress_allocation;
1696
1697         /* guarantee minimum buffer size under pressure */
1698         if (kind == SK_MEM_RECV) {
1699                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1700                         return 1;
1701         } else { /* SK_MEM_SEND */
1702                 if (sk->sk_type == SOCK_STREAM) {
1703                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1704                                 return 1;
1705                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1706                            prot->sysctl_wmem[0])
1707                                 return 1;
1708         }
1709
1710         if (prot->memory_pressure) {
1711                 int alloc;
1712
1713                 if (!*prot->memory_pressure)
1714                         return 1;
1715                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1716                 if (prot->sysctl_mem[2] > alloc *
1717                     sk_mem_pages(sk->sk_wmem_queued +
1718                                  atomic_read(&sk->sk_rmem_alloc) +
1719                                  sk->sk_forward_alloc))
1720                         return 1;
1721         }
1722
1723 suppress_allocation:
1724
1725         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1726                 sk_stream_moderate_sndbuf(sk);
1727
1728                 /* Fail only if the socket is still _under_ its sndbuf;
1729                  * in that case it cannot block, so we have to fail.
1730                  */
1731                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1732                         return 1;
1733         }
1734
1735         trace_sock_exceed_buf_limit(sk, prot, allocated);
1736
1737         /* Alas. Undo changes. */
1738         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1739         atomic_long_sub(amt, prot->memory_allocated);
1740         return 0;
1741 }
1742 EXPORT_SYMBOL(__sk_mem_schedule);
1743
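/*
 * Protocols normally reach __sk_mem_schedule() through the sk_wmem_schedule()
 * and sk_rmem_schedule() inlines, which first try to satisfy the request from
 * sk_forward_alloc.  Rough receive-side sketch (sk, skb and the drop label
 * are the caller's):
 */
#if 0
        if (!sk_rmem_schedule(sk, skb->truesize))
                goto drop;                      /* protocol memory exhausted */
        skb_set_owner_r(skb, sk);               /* charge sk_rmem_alloc */
        __skb_queue_tail(&sk->sk_receive_queue, skb);
#endif
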
1744 /**
1745  *      __sk_mem_reclaim - reclaim memory_allocated
1746  *      @sk: socket
1747  */
1748 void __sk_mem_reclaim(struct sock *sk)
1749 {
1750         struct proto *prot = sk->sk_prot;
1751
1752         atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1753                    prot->memory_allocated);
1754         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1755
1756         if (prot->memory_pressure && *prot->memory_pressure &&
1757             (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1758                 *prot->memory_pressure = 0;
1759 }
1760 EXPORT_SYMBOL(__sk_mem_reclaim);
1761
1762
1763 /*
1764  * Set of default routines for initialising struct proto_ops when
1765  * the protocol does not support a particular function. In certain
1766  * cases where it makes no sense for a protocol to have a "do nothing"
1767  * function, some default processing is provided.
1768  */
1769
1770 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1771 {
1772         return -EOPNOTSUPP;
1773 }
1774 EXPORT_SYMBOL(sock_no_bind);
1775
1776 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1777                     int len, int flags)
1778 {
1779         return -EOPNOTSUPP;
1780 }
1781 EXPORT_SYMBOL(sock_no_connect);
1782
1783 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1784 {
1785         return -EOPNOTSUPP;
1786 }
1787 EXPORT_SYMBOL(sock_no_socketpair);
1788
1789 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1790 {
1791         return -EOPNOTSUPP;
1792 }
1793 EXPORT_SYMBOL(sock_no_accept);
1794
1795 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1796                     int *len, int peer)
1797 {
1798         return -EOPNOTSUPP;
1799 }
1800 EXPORT_SYMBOL(sock_no_getname);
1801
1802 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1803 {
1804         return 0;
1805 }
1806 EXPORT_SYMBOL(sock_no_poll);
1807
1808 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1809 {
1810         return -EOPNOTSUPP;
1811 }
1812 EXPORT_SYMBOL(sock_no_ioctl);
1813
1814 int sock_no_listen(struct socket *sock, int backlog)
1815 {
1816         return -EOPNOTSUPP;
1817 }
1818 EXPORT_SYMBOL(sock_no_listen);
1819
1820 int sock_no_shutdown(struct socket *sock, int how)
1821 {
1822         return -EOPNOTSUPP;
1823 }
1824 EXPORT_SYMBOL(sock_no_shutdown);
1825
1826 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1827                     char __user *optval, unsigned int optlen)
1828 {
1829         return -EOPNOTSUPP;
1830 }
1831 EXPORT_SYMBOL(sock_no_setsockopt);
1832
1833 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1834                     char __user *optval, int __user *optlen)
1835 {
1836         return -EOPNOTSUPP;
1837 }
1838 EXPORT_SYMBOL(sock_no_getsockopt);
1839
1840 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1841                     size_t len)
1842 {
1843         return -EOPNOTSUPP;
1844 }
1845 EXPORT_SYMBOL(sock_no_sendmsg);
1846
1847 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1848                     size_t len, int flags)
1849 {
1850         return -EOPNOTSUPP;
1851 }
1852 EXPORT_SYMBOL(sock_no_recvmsg);
1853
1854 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1855 {
1856         /* Mirror missing mmap method error code */
1857         return -ENODEV;
1858 }
1859 EXPORT_SYMBOL(sock_no_mmap);
1860
1861 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1862 {
1863         ssize_t res;
1864         struct msghdr msg = {.msg_flags = flags};
1865         struct kvec iov;
1866         char *kaddr = kmap(page);
1867         iov.iov_base = kaddr + offset;
1868         iov.iov_len = size;
1869         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1870         kunmap(page);
1871         return res;
1872 }
1873 EXPORT_SYMBOL(sock_no_sendpage);
1874
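/*
 * Protocols wire these stubs into their proto_ops for operations they do not
 * support.  Hypothetical example (PF_EXAMPLE and example_bind are made up):
 */
#if 0
static const struct proto_ops example_dgram_ops = {
        .family         = PF_EXAMPLE,
        .owner          = THIS_MODULE,
        .bind           = example_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
        /* ... */
};
#endif
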
1875 /*
1876  *      Default Socket Callbacks
1877  */
1878
1879 static void sock_def_wakeup(struct sock *sk)
1880 {
1881         struct socket_wq *wq;
1882
1883         rcu_read_lock();
1884         wq = rcu_dereference(sk->sk_wq);
1885         if (wq_has_sleeper(wq))
1886                 wake_up_interruptible_all(&wq->wait);
1887         rcu_read_unlock();
1888 }
1889
1890 static void sock_def_error_report(struct sock *sk)
1891 {
1892         struct socket_wq *wq;
1893
1894         rcu_read_lock();
1895         wq = rcu_dereference(sk->sk_wq);
1896         if (wq_has_sleeper(wq))
1897                 wake_up_interruptible_poll(&wq->wait, POLLERR);
1898         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1899         rcu_read_unlock();
1900 }
1901
1902 static void sock_def_readable(struct sock *sk, int len)
1903 {
1904         struct socket_wq *wq;
1905
1906         rcu_read_lock();
1907         wq = rcu_dereference(sk->sk_wq);
1908         if (wq_has_sleeper(wq))
1909                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1910                                                 POLLRDNORM | POLLRDBAND);
1911         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1912         rcu_read_unlock();
1913 }
1914
1915 static void sock_def_write_space(struct sock *sk)
1916 {
1917         struct socket_wq *wq;
1918
1919         rcu_read_lock();
1920
1921         /* Do not wake up a writer until he can make "significant"
1922          * progress.  --DaveM
1923          */
1924         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1925                 wq = rcu_dereference(sk->sk_wq);
1926                 if (wq_has_sleeper(wq))
1927                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
1928                                                 POLLWRNORM | POLLWRBAND);
1929
1930                 /* Should agree with poll, otherwise some programs break */
1931                 if (sock_writeable(sk))
1932                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1933         }
1934
1935         rcu_read_unlock();
1936 }
1937
1938 static void sock_def_destruct(struct sock *sk)
1939 {
1940         kfree(sk->sk_protinfo);
1941 }
1942
1943 void sk_send_sigurg(struct sock *sk)
1944 {
1945         if (sk->sk_socket && sk->sk_socket->file)
1946                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1947                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1948 }
1949 EXPORT_SYMBOL(sk_send_sigurg);
1950
1951 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1952                     unsigned long expires)
1953 {
1954         if (!mod_timer(timer, expires))
1955                 sock_hold(sk);
1956 }
1957 EXPORT_SYMBOL(sk_reset_timer);
1958
1959 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1960 {
1961         if (timer_pending(timer) && del_timer(timer))
1962                 __sock_put(sk);
1963 }
1964 EXPORT_SYMBOL(sk_stop_timer);
1965
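/*
 * sk_reset_timer() takes a reference on the socket when it arms an inactive
 * timer; that reference is dropped by sock_put() in the handler, or by
 * sk_stop_timer() if the timer is cancelled first.  Sketch of a handler for a
 * hypothetical protocol timer:
 */
#if 0
static void example_timer_handler(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        bh_lock_sock(sk);
        /* ... protocol work; possibly sk_reset_timer() to re-arm ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* pairs with the hold taken when armed */
}
#endif
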
1966 void sock_init_data(struct socket *sock, struct sock *sk)
1967 {
1968         skb_queue_head_init(&sk->sk_receive_queue);
1969         skb_queue_head_init(&sk->sk_write_queue);
1970         skb_queue_head_init(&sk->sk_error_queue);
1971 #ifdef CONFIG_NET_DMA
1972         skb_queue_head_init(&sk->sk_async_wait_queue);
1973 #endif
1974
1975         sk->sk_send_head        =       NULL;
1976
1977         init_timer(&sk->sk_timer);
1978
1979         sk->sk_allocation       =       GFP_KERNEL;
1980         sk->sk_rcvbuf           =       sysctl_rmem_default;
1981         sk->sk_sndbuf           =       sysctl_wmem_default;
1982         sk->sk_state            =       TCP_CLOSE;
1983         sk_set_socket(sk, sock);
1984
1985         sock_set_flag(sk, SOCK_ZAPPED);
1986
1987         if (sock) {
1988                 sk->sk_type     =       sock->type;
1989                 sk->sk_wq       =       sock->wq;
1990                 sock->sk        =       sk;
1991         } else
1992                 sk->sk_wq       =       NULL;
1993
1994         spin_lock_init(&sk->sk_dst_lock);
1995         rwlock_init(&sk->sk_callback_lock);
1996         lockdep_set_class_and_name(&sk->sk_callback_lock,
1997                         af_callback_keys + sk->sk_family,
1998                         af_family_clock_key_strings[sk->sk_family]);
1999
2000         sk->sk_state_change     =       sock_def_wakeup;
2001         sk->sk_data_ready       =       sock_def_readable;
2002         sk->sk_write_space      =       sock_def_write_space;
2003         sk->sk_error_report     =       sock_def_error_report;
2004         sk->sk_destruct         =       sock_def_destruct;
2005
2006         sk->sk_sndmsg_page      =       NULL;
2007         sk->sk_sndmsg_off       =       0;
2008
2009         sk->sk_peer_pid         =       NULL;
2010         sk->sk_peer_cred        =       NULL;
2011         sk->sk_write_pending    =       0;
2012         sk->sk_rcvlowat         =       1;
2013         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2014         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2015
2016         sk->sk_stamp = ktime_set(-1L, 0);
2017
2018         /*
2019          * Before updating sk_refcnt, we must commit prior changes to memory
2020          * (Documentation/RCU/rculist_nulls.txt for details)
2021          */
2022         smp_wmb();
2023         atomic_set(&sk->sk_refcnt, 1);
2024         atomic_set(&sk->sk_drops, 0);
2025 }
2026 EXPORT_SYMBOL(sock_init_data);
2027
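/*
 * sock_init_data() is normally called from a family's ->create() handler
 * right after sk_alloc(), before protocol-specific setup.  Hypothetical
 * sketch (PF_EXAMPLE, example_proto and example_dgram_ops are made up):
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
        if (!sk)
                return -ENOMEM;

        sock->ops = &example_dgram_ops;
        sock_init_data(sock, sk);
        sk->sk_protocol = protocol;
        return 0;
}
#endif
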
2028 void lock_sock_nested(struct sock *sk, int subclass)
2029 {
2030         might_sleep();
2031         spin_lock_bh(&sk->sk_lock.slock);
2032         if (sk->sk_lock.owned)
2033                 __lock_sock(sk);
2034         sk->sk_lock.owned = 1;
2035         spin_unlock(&sk->sk_lock.slock);
2036         /*
2037          * The sk_lock has mutex_lock() semantics here:
2038          */
2039         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2040         local_bh_enable();
2041 }
2042 EXPORT_SYMBOL(lock_sock_nested);
2043
2044 void release_sock(struct sock *sk)
2045 {
2046         /*
2047          * The sk_lock has mutex_unlock() semantics:
2048          */
2049         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2050
2051         spin_lock_bh(&sk->sk_lock.slock);
2052         if (sk->sk_backlog.tail)
2053                 __release_sock(sk);
2054         sk->sk_lock.owned = 0;
2055         if (waitqueue_active(&sk->sk_lock.wq))
2056                 wake_up(&sk->sk_lock.wq);
2057         spin_unlock_bh(&sk->sk_lock.slock);
2058 }
2059 EXPORT_SYMBOL(release_sock);
2060
2061 /**
2062  * lock_sock_fast - fast version of lock_sock
2063  * @sk: socket
2064  *
2065  * This version should be used for very small sections where the process
2066  * won't block. Returns false if the fast path is taken:
2067  *   sk_lock.slock locked, owned = 0, BH disabled
2068  * Returns true if the slow path is taken:
2069  *   sk_lock.slock unlocked, owned = 1, BH enabled
2070  */
2071 bool lock_sock_fast(struct sock *sk)
2072 {
2073         might_sleep();
2074         spin_lock_bh(&sk->sk_lock.slock);
2075
2076         if (!sk->sk_lock.owned)
2077                 /*
2078                  * Note : fast path; we return with BH disabled and slock held
2079                  */
2080                 return false;
2081
2082         __lock_sock(sk);
2083         sk->sk_lock.owned = 1;
2084         spin_unlock(&sk->sk_lock.slock);
2085         /*
2086          * The sk_lock has mutex_lock() semantics here:
2087          */
2088         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2089         local_bh_enable();
2090         return true;
2091 }
2092 EXPORT_SYMBOL(lock_sock_fast);
2093
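/*
 * The return value must be passed back to unlock_sock_fast() (an inline in
 * include/net/sock.h) so the matching unlock path is taken:
 */
#if 0
        bool slow = lock_sock_fast(sk);

        /* short, non-blocking critical section */
        unlock_sock_fast(sk, slow);
#endif
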
2094 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2095 {
2096         struct timeval tv;
2097         if (!sock_flag(sk, SOCK_TIMESTAMP))
2098                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2099         tv = ktime_to_timeval(sk->sk_stamp);
2100         if (tv.tv_sec == -1)
2101                 return -ENOENT;
2102         if (tv.tv_sec == 0) {
2103                 sk->sk_stamp = ktime_get_real();
2104                 tv = ktime_to_timeval(sk->sk_stamp);
2105         }
2106         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2107 }
2108 EXPORT_SYMBOL(sock_get_timestamp);
2109
2110 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2111 {
2112         struct timespec ts;
2113         if (!sock_flag(sk, SOCK_TIMESTAMP))
2114                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2115         ts = ktime_to_timespec(sk->sk_stamp);
2116         if (ts.tv_sec == -1)
2117                 return -ENOENT;
2118         if (ts.tv_sec == 0) {
2119                 sk->sk_stamp = ktime_get_real();
2120                 ts = ktime_to_timespec(sk->sk_stamp);
2121         }
2122         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2123 }
2124 EXPORT_SYMBOL(sock_get_timestampns);
2125
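/*
 * These two helpers typically back a protocol's SIOCGSTAMP/SIOCGSTAMPNS ioctl
 * handling, e.g. from an ->ioctl() switch (sk and arg are the handler's):
 */
#if 0
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);
#endif
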
2126 void sock_enable_timestamp(struct sock *sk, int flag)
2127 {
2128         if (!sock_flag(sk, flag)) {
2129                 sock_set_flag(sk, flag);
2130                 /*
2131                  * We just set one of the two flags that require net
2132                  * time stamping, but time stamping might already have
2133                  * been on because of the other one.
2134                  */
2135                 if (!sock_flag(sk,
2136                                 flag == SOCK_TIMESTAMP ?
2137                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
2138                                 SOCK_TIMESTAMP))
2139                         net_enable_timestamp();
2140         }
2141 }
2142
2143 /*
2144  *      Get a socket option on a socket.
2145  *
2146  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2147  *      asynchronous errors should be reported by getsockopt. We assume
2148  *      this means if you specify SO_ERROR (otherwise what's the point of it).
2149  */
2150 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2151                            char __user *optval, int __user *optlen)
2152 {
2153         struct sock *sk = sock->sk;
2154
2155         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2156 }
2157 EXPORT_SYMBOL(sock_common_getsockopt);
2158
2159 #ifdef CONFIG_COMPAT
2160 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2161                                   char __user *optval, int __user *optlen)
2162 {
2163         struct sock *sk = sock->sk;
2164
2165         if (sk->sk_prot->compat_getsockopt != NULL)
2166                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2167                                                       optval, optlen);
2168         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2169 }
2170 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2171 #endif
2172
2173 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2174                         struct msghdr *msg, size_t size, int flags)
2175 {
2176         struct sock *sk = sock->sk;
2177         int addr_len = 0;
2178         int err;
2179
2180         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2181                                    flags & ~MSG_DONTWAIT, &addr_len);
2182         if (err >= 0)
2183                 msg->msg_namelen = addr_len;
2184         return err;
2185 }
2186 EXPORT_SYMBOL(sock_common_recvmsg);
2187
2188 /*
2189  *      Set socket options on a socket.
2190  */
2191 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2192                            char __user *optval, unsigned int optlen)
2193 {
2194         struct sock *sk = sock->sk;
2195
2196         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2197 }
2198 EXPORT_SYMBOL(sock_common_setsockopt);
2199
2200 #ifdef CONFIG_COMPAT
2201 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2202                                   char __user *optval, unsigned int optlen)
2203 {
2204         struct sock *sk = sock->sk;
2205
2206         if (sk->sk_prot->compat_setsockopt != NULL)
2207                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2208                                                       optval, optlen);
2209         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2210 }
2211 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2212 #endif
2213
2214 void sk_common_release(struct sock *sk)
2215 {
2216         if (sk->sk_prot->destroy)
2217                 sk->sk_prot->destroy(sk);
2218
2219         /*
2220          * Observation: when sk_common_release() is called, processes have
2221          * no access to the socket, but the network stack still does.
2222          * Step one, detach it from networking:
2223          *
2224          * A. Remove from hash tables.
2225          */
2226
2227         sk->sk_prot->unhash(sk);
2228
2229         /*
2230          * At this point the socket cannot receive new packets, but it is
2231          * possible that some packets are still in flight because another CPU
2232          * may have done the hash table lookup before we unhashed the socket.
2233          * They will reach the receive queue and be purged by the socket destructor.
2234          *
2235          * Also, we still have packets pending on the receive queue and, probably,
2236          * our own packets waiting in device queues. sock_destroy will drain the
2237          * receive queue, but transmitted packets will delay socket destruction
2238          * until the last reference is released.
2239          */
2240
2241         sock_orphan(sk);
2242
2243         xfrm_sk_free_policy(sk);
2244
2245         sk_refcnt_debug_release(sk);
2246         sock_put(sk);
2247 }
2248 EXPORT_SYMBOL(sk_common_release);
2249
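/*
 * Datagram-style protocols typically call this from their proto ->close()
 * handler once protocol-private state has been torn down (example_close() is
 * hypothetical):
 */
#if 0
static void example_close(struct sock *sk, long timeout)
{
        /* ... protocol-specific teardown ... */
        sk_common_release(sk);
}
#endif
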
2250 static DEFINE_RWLOCK(proto_list_lock);
2251 static LIST_HEAD(proto_list);
2252
2253 #ifdef CONFIG_PROC_FS
2254 #define PROTO_INUSE_NR  64      /* should be enough for now */
2255 struct prot_inuse {
2256         int val[PROTO_INUSE_NR];
2257 };
2258
2259 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2260
2261 #ifdef CONFIG_NET_NS
2262 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2263 {
2264         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2265 }
2266 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2267
2268 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2269 {
2270         int cpu, idx = prot->inuse_idx;
2271         int res = 0;
2272
2273         for_each_possible_cpu(cpu)
2274                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2275
2276         return res >= 0 ? res : 0;
2277 }
2278 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2279
2280 static int __net_init sock_inuse_init_net(struct net *net)
2281 {
2282         net->core.inuse = alloc_percpu(struct prot_inuse);
2283         return net->core.inuse ? 0 : -ENOMEM;
2284 }
2285
2286 static void __net_exit sock_inuse_exit_net(struct net *net)
2287 {
2288         free_percpu(net->core.inuse);
2289 }
2290
2291 static struct pernet_operations net_inuse_ops = {
2292         .init = sock_inuse_init_net,
2293         .exit = sock_inuse_exit_net,
2294 };
2295
2296 static __init int net_inuse_init(void)
2297 {
2298         if (register_pernet_subsys(&net_inuse_ops))
2299                 panic("Cannot initialize net inuse counters");
2300
2301         return 0;
2302 }
2303
2304 core_initcall(net_inuse_init);
2305 #else
2306 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2307
2308 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2309 {
2310         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2311 }
2312 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2313
2314 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2315 {
2316         int cpu, idx = prot->inuse_idx;
2317         int res = 0;
2318
2319         for_each_possible_cpu(cpu)
2320                 res += per_cpu(prot_inuse, cpu).val[idx];
2321
2322         return res >= 0 ? res : 0;
2323 }
2324 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2325 #endif
2326
2327 static void assign_proto_idx(struct proto *prot)
2328 {
2329         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2330
2331         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2332                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2333                 return;
2334         }
2335
2336         set_bit(prot->inuse_idx, proto_inuse_idx);
2337 }
2338
2339 static void release_proto_idx(struct proto *prot)
2340 {
2341         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2342                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2343 }
2344 #else
2345 static inline void assign_proto_idx(struct proto *prot)
2346 {
2347 }
2348
2349 static inline void release_proto_idx(struct proto *prot)
2350 {
2351 }
2352 #endif
2353
2354 int proto_register(struct proto *prot, int alloc_slab)
2355 {
2356         if (alloc_slab) {
2357                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2358                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2359                                         NULL);
2360
2361                 if (prot->slab == NULL) {
2362                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2363                                prot->name);
2364                         goto out;
2365                 }
2366
2367                 if (prot->rsk_prot != NULL) {
2368                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2369                         if (prot->rsk_prot->slab_name == NULL)
2370                                 goto out_free_sock_slab;
2371
2372                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2373                                                                  prot->rsk_prot->obj_size, 0,
2374                                                                  SLAB_HWCACHE_ALIGN, NULL);
2375
2376                         if (prot->rsk_prot->slab == NULL) {
2377                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2378                                        prot->name);
2379                                 goto out_free_request_sock_slab_name;
2380                         }
2381                 }
2382
2383                 if (prot->twsk_prot != NULL) {
2384                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2385
2386                         if (prot->twsk_prot->twsk_slab_name == NULL)
2387                                 goto out_free_request_sock_slab;
2388
2389                         prot->twsk_prot->twsk_slab =
2390                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2391                                                   prot->twsk_prot->twsk_obj_size,
2392                                                   0,
2393                                                   SLAB_HWCACHE_ALIGN |
2394                                                         prot->slab_flags,
2395                                                   NULL);
2396                         if (prot->twsk_prot->twsk_slab == NULL)
2397                                 goto out_free_timewait_sock_slab_name;
2398                 }
2399         }
2400
2401         write_lock(&proto_list_lock);
2402         list_add(&prot->node, &proto_list);
2403         assign_proto_idx(prot);
2404         write_unlock(&proto_list_lock);
2405         return 0;
2406
2407 out_free_timewait_sock_slab_name:
2408         kfree(prot->twsk_prot->twsk_slab_name);
2409 out_free_request_sock_slab:
2410         if (prot->rsk_prot && prot->rsk_prot->slab) {
2411                 kmem_cache_destroy(prot->rsk_prot->slab);
2412                 prot->rsk_prot->slab = NULL;
2413         }
2414 out_free_request_sock_slab_name:
2415         if (prot->rsk_prot)
2416                 kfree(prot->rsk_prot->slab_name);
2417 out_free_sock_slab:
2418         kmem_cache_destroy(prot->slab);
2419         prot->slab = NULL;
2420 out:
2421         return -ENOBUFS;
2422 }
2423 EXPORT_SYMBOL(proto_register);
2424
2425 void proto_unregister(struct proto *prot)
2426 {
2427         write_lock(&proto_list_lock);
2428         release_proto_idx(prot);
2429         list_del(&prot->node);
2430         write_unlock(&proto_list_lock);
2431
2432         if (prot->slab != NULL) {
2433                 kmem_cache_destroy(prot->slab);
2434                 prot->slab = NULL;
2435         }
2436
2437         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2438                 kmem_cache_destroy(prot->rsk_prot->slab);
2439                 kfree(prot->rsk_prot->slab_name);
2440                 prot->rsk_prot->slab = NULL;
2441         }
2442
2443         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2444                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2445                 kfree(prot->twsk_prot->twsk_slab_name);
2446                 prot->twsk_prot->twsk_slab = NULL;
2447         }
2448 }
2449 EXPORT_SYMBOL(proto_unregister);
2450
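/*
 * proto_register()/proto_unregister() are usually paired in a protocol
 * module's init/exit path, with alloc_slab = 1 so sockets come from a
 * dedicated kmem cache (example_proto and struct example_sock are made up):
 */
#if 0
static struct proto example_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct example_sock),
};

static int __init example_init(void)
{
        int rc = proto_register(&example_proto, 1);

        if (rc)
                return rc;
        /* ... register the socket family / proto_ops ... */
        return 0;
}

static void __exit example_exit(void)
{
        proto_unregister(&example_proto);
}
#endif
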
2451 #ifdef CONFIG_PROC_FS
2452 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2453         __acquires(proto_list_lock)
2454 {
2455         read_lock(&proto_list_lock);
2456         return seq_list_start_head(&proto_list, *pos);
2457 }
2458
2459 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2460 {
2461         return seq_list_next(v, &proto_list, pos);
2462 }
2463
2464 static void proto_seq_stop(struct seq_file *seq, void *v)
2465         __releases(proto_list_lock)
2466 {
2467         read_unlock(&proto_list_lock);
2468 }
2469
2470 static char proto_method_implemented(const void *method)
2471 {
2472         return method == NULL ? 'n' : 'y';
2473 }
2474
2475 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2476 {
2477         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2478                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2479                    proto->name,
2480                    proto->obj_size,
2481                    sock_prot_inuse_get(seq_file_net(seq), proto),
2482                    proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
2483                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2484                    proto->max_header,
2485                    proto->slab == NULL ? "no" : "yes",
2486                    module_name(proto->owner),
2487                    proto_method_implemented(proto->close),
2488                    proto_method_implemented(proto->connect),
2489                    proto_method_implemented(proto->disconnect),
2490                    proto_method_implemented(proto->accept),
2491                    proto_method_implemented(proto->ioctl),
2492                    proto_method_implemented(proto->init),
2493                    proto_method_implemented(proto->destroy),
2494                    proto_method_implemented(proto->shutdown),
2495                    proto_method_implemented(proto->setsockopt),
2496                    proto_method_implemented(proto->getsockopt),
2497                    proto_method_implemented(proto->sendmsg),
2498                    proto_method_implemented(proto->recvmsg),
2499                    proto_method_implemented(proto->sendpage),
2500                    proto_method_implemented(proto->bind),
2501                    proto_method_implemented(proto->backlog_rcv),
2502                    proto_method_implemented(proto->hash),
2503                    proto_method_implemented(proto->unhash),
2504                    proto_method_implemented(proto->get_port),
2505                    proto_method_implemented(proto->enter_memory_pressure));
2506 }
2507
2508 static int proto_seq_show(struct seq_file *seq, void *v)
2509 {
2510         if (v == &proto_list)
2511                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2512                            "protocol",
2513                            "size",
2514                            "sockets",
2515                            "memory",
2516                            "press",
2517                            "maxhdr",
2518                            "slab",
2519                            "module",
2520                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2521         else
2522                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2523         return 0;
2524 }
2525
2526 static const struct seq_operations proto_seq_ops = {
2527         .start  = proto_seq_start,
2528         .next   = proto_seq_next,
2529         .stop   = proto_seq_stop,
2530         .show   = proto_seq_show,
2531 };
2532
2533 static int proto_seq_open(struct inode *inode, struct file *file)
2534 {
2535         return seq_open_net(inode, file, &proto_seq_ops,
2536                             sizeof(struct seq_net_private));
2537 }
2538
2539 static const struct file_operations proto_seq_fops = {
2540         .owner          = THIS_MODULE,
2541         .open           = proto_seq_open,
2542         .read           = seq_read,
2543         .llseek         = seq_lseek,
2544         .release        = seq_release_net,
2545 };
2546
2547 static __net_init int proto_init_net(struct net *net)
2548 {
2549         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2550                 return -ENOMEM;
2551
2552         return 0;
2553 }
2554
2555 static __net_exit void proto_exit_net(struct net *net)
2556 {
2557         proc_net_remove(net, "protocols");
2558 }
2559
2560
2561 static __net_initdata struct pernet_operations proto_net_ops = {
2562         .init = proto_init_net,
2563         .exit = proto_exit_net,
2564 };
2565
2566 static int __init proto_init(void)
2567 {
2568         return register_pernet_subsys(&proto_net_ops);
2569 }
2570
2571 subsys_initcall(proto_init);
2572
2573 #endif /* PROC_FS */