[PATCH] lockdep: annotate af_unix locking
[pandora-kernel.git] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by the above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of hashed socks (for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/sock.h>
107 #include <net/tcp_states.h>
108 #include <net/af_unix.h>
109 #include <linux/proc_fs.h>
110 #include <linux/seq_file.h>
111 #include <net/scm.h>
112 #include <linux/init.h>
113 #include <linux/poll.h>
114 #include <linux/smp_lock.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119
/* Initial sk_max_ack_backlog given to every new socket in unix_create1(). */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Socket hash table, protected by unix_table_lock. The extra slot at index
 * UNIX_HASH_SIZE is the list of sockets that are not bound yet.
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
DEFINE_SPINLOCK(unix_table_lock);
/* Count of allocated unix socks: raised in unix_create1(), dropped in the destructor. */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

/* List head for sockets without an address (where unix_create1() puts them). */
#define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])

/*
 * True for an abstract address: filesystem binds store UNIX_HASH_SIZE in
 * addr->hash (see unix_bind()). Only valid once the socket has an address.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
129
#ifdef CONFIG_SECURITY_NETWORK
/*
 * Query the security hook for the peer security data of this datagram.
 * When the hook reports an error, the skb's security-data slot is cleared.
 */
static void unix_get_peersec_dgram(struct sk_buff *skb)
{
	if (security_socket_getpeersec_dgram(skb, UNIXSECDATA(skb),
					     UNIXSECLEN(skb)) != 0)
		*(UNIXSECDATA(skb)) = NULL;
}

/* Copy the security data pointer and length kept in the skb into the scm cookie. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->seclen = *UNIXSECLEN(skb);
	scm->secdata = *UNIXSECDATA(skb);
}
#else
/* Empty stand-ins used when CONFIG_SECURITY_NETWORK is not configured. */
static void unix_get_peersec_dgram(struct sk_buff *skb)
{
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}
#endif /* CONFIG_SECURITY_NETWORK */
153
154 /*
155  *  SMP locking strategy:
156  *    hash table is protected with spinlock unix_table_lock
157  *    each socket state is protected by separate rwlock.
158  */
159
160 static inline unsigned unix_hash_fold(unsigned hash)
161 {
162         hash ^= hash>>16;
163         hash ^= hash>>8;
164         return hash&(UNIX_HASH_SIZE-1);
165 }
166
167 #define unix_peer(sk) (unix_sk(sk)->peer)
168
169 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
170 {
171         return unix_peer(osk) == sk;
172 }
173
174 static inline int unix_may_send(struct sock *sk, struct sock *osk)
175 {
176         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
177 }
178
179 static struct sock *unix_peer_get(struct sock *s)
180 {
181         struct sock *peer;
182
183         unix_state_rlock(s);
184         peer = unix_peer(s);
185         if (peer)
186                 sock_hold(peer);
187         unix_state_runlock(s);
188         return peer;
189 }
190
191 static inline void unix_release_addr(struct unix_address *addr)
192 {
193         if (atomic_dec_and_test(&addr->refcnt))
194                 kfree(addr);
195 }
196
197 /*
198  *      Check unix socket name:
199  *              - it must not be zero length.
200  *              - if it starts with a non-zero byte, it must be NUL terminated (FS object)
201  *              - if it starts with a zero byte, it is an abstract name.
202  */
203  
204 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
205 {
206         if (len <= sizeof(short) || len > sizeof(*sunaddr))
207                 return -EINVAL;
208         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
209                 return -EINVAL;
210         if (sunaddr->sun_path[0]) {
211                 /*
212                  * This may look like an off by one error but it is a bit more
213                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
214                  * sun_path[108] doesnt as such exist.  However in kernel space
215                  * we are guaranteed that it is a valid memory location in our
216                  * kernel address buffer.
217                  */
218                 ((char *)sunaddr)[len]=0;
219                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
220                 return len;
221         }
222
223         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
224         return len;
225 }
226
/*
 * Unlink sk from whatever hash list it is on. Lock-free variant:
 * callers in this file hold unix_table_lock around it.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
231
/*
 * Link sk onto the given hash list. The socket is expected to be unhashed
 * (BUG_TRAP otherwise). Lock-free variant: callers in this file hold
 * unix_table_lock around it.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));

	sk_add_node(sk, list);
}
237
238 static inline void unix_remove_socket(struct sock *sk)
239 {
240         spin_lock(&unix_table_lock);
241         __unix_remove_socket(sk);
242         spin_unlock(&unix_table_lock);
243 }
244
245 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
246 {
247         spin_lock(&unix_table_lock);
248         __unix_insert_socket(list, sk);
249         spin_unlock(&unix_table_lock);
250 }
251
252 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
253                                               int len, int type, unsigned hash)
254 {
255         struct sock *s;
256         struct hlist_node *node;
257
258         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
259                 struct unix_sock *u = unix_sk(s);
260
261                 if (u->addr->len == len &&
262                     !memcmp(u->addr->name, sunname, len))
263                         goto found;
264         }
265         s = NULL;
266 found:
267         return s;
268 }
269
270 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
271                                                    int len, int type,
272                                                    unsigned hash)
273 {
274         struct sock *s;
275
276         spin_lock(&unix_table_lock);
277         s = __unix_find_socket_byname(sunname, len, type, hash);
278         if (s)
279                 sock_hold(s);
280         spin_unlock(&unix_table_lock);
281         return s;
282 }
283
284 static struct sock *unix_find_socket_byinode(struct inode *i)
285 {
286         struct sock *s;
287         struct hlist_node *node;
288
289         spin_lock(&unix_table_lock);
290         sk_for_each(s, node,
291                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
292                 struct dentry *dentry = unix_sk(s)->dentry;
293
294                 if(dentry && dentry->d_inode == i)
295                 {
296                         sock_hold(s);
297                         goto found;
298                 }
299         }
300         s = NULL;
301 found:
302         spin_unlock(&unix_table_lock);
303         return s;
304 }
305
306 static inline int unix_writable(struct sock *sk)
307 {
308         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
309 }
310
311 static void unix_write_space(struct sock *sk)
312 {
313         read_lock(&sk->sk_callback_lock);
314         if (unix_writable(sk)) {
315                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
316                         wake_up_interruptible(sk->sk_sleep);
317                 sk_wake_async(sk, 2, POLL_OUT);
318         }
319         read_unlock(&sk->sk_callback_lock);
320 }
321
322 /* When dgram socket disconnects (or changes its peer), we clear its receive
323  * queue of packets arrived from previous peer. First, it allows to do
324  * flow control based only on wmem_alloc; second, sk connected to peer
325  * may receive messages only from that peer. */
326 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
327 {
328         if (!skb_queue_empty(&sk->sk_receive_queue)) {
329                 skb_queue_purge(&sk->sk_receive_queue);
330                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
331
332                 /* If one link of bidirectional dgram pipe is disconnected,
333                  * we signal error. Messages are lost. Do not make this,
334                  * when peer was not connected to us.
335                  */
336                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
337                         other->sk_err = ECONNRESET;
338                         other->sk_error_report(other);
339                 }
340         }
341 }
342
343 static void unix_sock_destructor(struct sock *sk)
344 {
345         struct unix_sock *u = unix_sk(sk);
346
347         skb_queue_purge(&sk->sk_receive_queue);
348
349         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
350         BUG_TRAP(sk_unhashed(sk));
351         BUG_TRAP(!sk->sk_socket);
352         if (!sock_flag(sk, SOCK_DEAD)) {
353                 printk("Attempt to release alive unix socket: %p\n", sk);
354                 return;
355         }
356
357         if (u->addr)
358                 unix_release_addr(u->addr);
359
360         atomic_dec(&unix_nr_socks);
361 #ifdef UNIX_REFCNT_DEBUG
362         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
363 #endif
364 }
365
366 static int unix_release_sock (struct sock *sk, int embrion)
367 {
368         struct unix_sock *u = unix_sk(sk);
369         struct dentry *dentry;
370         struct vfsmount *mnt;
371         struct sock *skpair;
372         struct sk_buff *skb;
373         int state;
374
375         unix_remove_socket(sk);
376
377         /* Clear state */
378         unix_state_wlock(sk);
379         sock_orphan(sk);
380         sk->sk_shutdown = SHUTDOWN_MASK;
381         dentry       = u->dentry;
382         u->dentry    = NULL;
383         mnt          = u->mnt;
384         u->mnt       = NULL;
385         state = sk->sk_state;
386         sk->sk_state = TCP_CLOSE;
387         unix_state_wunlock(sk);
388
389         wake_up_interruptible_all(&u->peer_wait);
390
391         skpair=unix_peer(sk);
392
393         if (skpair!=NULL) {
394                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
395                         unix_state_wlock(skpair);
396                         /* No more writes */
397                         skpair->sk_shutdown = SHUTDOWN_MASK;
398                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
399                                 skpair->sk_err = ECONNRESET;
400                         unix_state_wunlock(skpair);
401                         skpair->sk_state_change(skpair);
402                         read_lock(&skpair->sk_callback_lock);
403                         sk_wake_async(skpair,1,POLL_HUP);
404                         read_unlock(&skpair->sk_callback_lock);
405                 }
406                 sock_put(skpair); /* It may now die */
407                 unix_peer(sk) = NULL;
408         }
409
410         /* Try to flush out this socket. Throw out buffers at least */
411
412         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
413                 if (state==TCP_LISTEN)
414                         unix_release_sock(skb->sk, 1);
415                 /* passed fds are erased in the kfree_skb hook        */
416                 kfree_skb(skb);
417         }
418
419         if (dentry) {
420                 dput(dentry);
421                 mntput(mnt);
422         }
423
424         sock_put(sk);
425
426         /* ---- Socket is dead now and most probably destroyed ---- */
427
428         /*
429          * Fixme: BSD difference: In BSD all sockets connected to use get
430          *        ECONNRESET and we die on the spot. In Linux we behave
431          *        like files and pipes do and wait for the last
432          *        dereference.
433          *
434          * Can't we simply set sock->err?
435          *
436          *        What the above comment does talk about? --ANK(980817)
437          */
438
439         if (atomic_read(&unix_tot_inflight))
440                 unix_gc();              /* Garbage collect fds */       
441
442         return 0;
443 }
444
445 static int unix_listen(struct socket *sock, int backlog)
446 {
447         int err;
448         struct sock *sk = sock->sk;
449         struct unix_sock *u = unix_sk(sk);
450
451         err = -EOPNOTSUPP;
452         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
453                 goto out;                       /* Only stream/seqpacket sockets accept */
454         err = -EINVAL;
455         if (!u->addr)
456                 goto out;                       /* No listens on an unbound socket */
457         unix_state_wlock(sk);
458         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
459                 goto out_unlock;
460         if (backlog > sk->sk_max_ack_backlog)
461                 wake_up_interruptible_all(&u->peer_wait);
462         sk->sk_max_ack_backlog  = backlog;
463         sk->sk_state            = TCP_LISTEN;
464         /* set credentials so connect can copy them */
465         sk->sk_peercred.pid     = current->tgid;
466         sk->sk_peercred.uid     = current->euid;
467         sk->sk_peercred.gid     = current->egid;
468         err = 0;
469
470 out_unlock:
471         unix_state_wunlock(sk);
472 out:
473         return err;
474 }
475
/*
 * Forward declarations of the socket operations referenced by the
 * proto_ops tables that follow (unix_stream_ops, unix_dgram_ops,
 * unix_seqpacket_ops).
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t);
498
499 static const struct proto_ops unix_stream_ops = {
500         .family =       PF_UNIX,
501         .owner =        THIS_MODULE,
502         .release =      unix_release,
503         .bind =         unix_bind,
504         .connect =      unix_stream_connect,
505         .socketpair =   unix_socketpair,
506         .accept =       unix_accept,
507         .getname =      unix_getname,
508         .poll =         unix_poll,
509         .ioctl =        unix_ioctl,
510         .listen =       unix_listen,
511         .shutdown =     unix_shutdown,
512         .setsockopt =   sock_no_setsockopt,
513         .getsockopt =   sock_no_getsockopt,
514         .sendmsg =      unix_stream_sendmsg,
515         .recvmsg =      unix_stream_recvmsg,
516         .mmap =         sock_no_mmap,
517         .sendpage =     sock_no_sendpage,
518 };
519
520 static const struct proto_ops unix_dgram_ops = {
521         .family =       PF_UNIX,
522         .owner =        THIS_MODULE,
523         .release =      unix_release,
524         .bind =         unix_bind,
525         .connect =      unix_dgram_connect,
526         .socketpair =   unix_socketpair,
527         .accept =       sock_no_accept,
528         .getname =      unix_getname,
529         .poll =         datagram_poll,
530         .ioctl =        unix_ioctl,
531         .listen =       sock_no_listen,
532         .shutdown =     unix_shutdown,
533         .setsockopt =   sock_no_setsockopt,
534         .getsockopt =   sock_no_getsockopt,
535         .sendmsg =      unix_dgram_sendmsg,
536         .recvmsg =      unix_dgram_recvmsg,
537         .mmap =         sock_no_mmap,
538         .sendpage =     sock_no_sendpage,
539 };
540
541 static const struct proto_ops unix_seqpacket_ops = {
542         .family =       PF_UNIX,
543         .owner =        THIS_MODULE,
544         .release =      unix_release,
545         .bind =         unix_bind,
546         .connect =      unix_stream_connect,
547         .socketpair =   unix_socketpair,
548         .accept =       unix_accept,
549         .getname =      unix_getname,
550         .poll =         datagram_poll,
551         .ioctl =        unix_ioctl,
552         .listen =       unix_listen,
553         .shutdown =     unix_shutdown,
554         .setsockopt =   sock_no_setsockopt,
555         .getsockopt =   sock_no_getsockopt,
556         .sendmsg =      unix_seqpacket_sendmsg,
557         .recvmsg =      unix_dgram_recvmsg,
558         .mmap =         sock_no_mmap,
559         .sendpage =     sock_no_sendpage,
560 };
561
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size is
 * the size of struct unix_sock.
 */
static struct proto unix_proto = {
        .name     = "UNIX",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct unix_sock),
};
567
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (done with lockdep_set_class() in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
575
576 static struct sock * unix_create1(struct socket *sock)
577 {
578         struct sock *sk = NULL;
579         struct unix_sock *u;
580
581         if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
582                 goto out;
583
584         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
585         if (!sk)
586                 goto out;
587
588         atomic_inc(&unix_nr_socks);
589
590         sock_init_data(sock,sk);
591         lockdep_set_class(&sk->sk_receive_queue.lock,
592                                 &af_unix_sk_receive_queue_lock_key);
593
594         sk->sk_write_space      = unix_write_space;
595         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
596         sk->sk_destruct         = unix_sock_destructor;
597         u         = unix_sk(sk);
598         u->dentry = NULL;
599         u->mnt    = NULL;
600         spin_lock_init(&u->lock);
601         atomic_set(&u->inflight, sock ? 0 : -1);
602         mutex_init(&u->readlock); /* single task reading lock */
603         init_waitqueue_head(&u->peer_wait);
604         unix_insert_socket(unix_sockets_unbound, sk);
605 out:
606         return sk;
607 }
608
609 static int unix_create(struct socket *sock, int protocol)
610 {
611         if (protocol && protocol != PF_UNIX)
612                 return -EPROTONOSUPPORT;
613
614         sock->state = SS_UNCONNECTED;
615
616         switch (sock->type) {
617         case SOCK_STREAM:
618                 sock->ops = &unix_stream_ops;
619                 break;
620                 /*
621                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
622                  *      nothing uses it.
623                  */
624         case SOCK_RAW:
625                 sock->type=SOCK_DGRAM;
626         case SOCK_DGRAM:
627                 sock->ops = &unix_dgram_ops;
628                 break;
629         case SOCK_SEQPACKET:
630                 sock->ops = &unix_seqpacket_ops;
631                 break;
632         default:
633                 return -ESOCKTNOSUPPORT;
634         }
635
636         return unix_create1(sock) ? 0 : -ENOMEM;
637 }
638
639 static int unix_release(struct socket *sock)
640 {
641         struct sock *sk = sock->sk;
642
643         if (!sk)
644                 return 0;
645
646         sock->sk = NULL;
647
648         return unix_release_sock (sk, 0);
649 }
650
651 static int unix_autobind(struct socket *sock)
652 {
653         struct sock *sk = sock->sk;
654         struct unix_sock *u = unix_sk(sk);
655         static u32 ordernum = 1;
656         struct unix_address * addr;
657         int err;
658
659         mutex_lock(&u->readlock);
660
661         err = 0;
662         if (u->addr)
663                 goto out;
664
665         err = -ENOMEM;
666         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
667         if (!addr)
668                 goto out;
669
670         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
671         addr->name->sun_family = AF_UNIX;
672         atomic_set(&addr->refcnt, 1);
673
674 retry:
675         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
676         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
677
678         spin_lock(&unix_table_lock);
679         ordernum = (ordernum+1)&0xFFFFF;
680
681         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
682                                       addr->hash)) {
683                 spin_unlock(&unix_table_lock);
684                 /* Sanity yield. It is unusual case, but yet... */
685                 if (!(ordernum&0xFF))
686                         yield();
687                 goto retry;
688         }
689         addr->hash ^= sk->sk_type;
690
691         __unix_remove_socket(sk);
692         u->addr = addr;
693         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
694         spin_unlock(&unix_table_lock);
695         err = 0;
696
697 out:    mutex_unlock(&u->readlock);
698         return err;
699 }
700
/*
 * Look up the socket a sockaddr_un refers to.
 *
 * Abstract names (first path byte zero) are looked up in the hash table by
 * name; filesystem names are resolved through the VFS and matched by inode.
 * On success the socket is returned with a reference held. On failure NULL
 * is returned and *error is set: the path lookup / permission error,
 * -ECONNREFUSED when no socket is found, or -EPROTOTYPE when a filesystem
 * socket exists but has a different type.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *other;
	struct nameidata nd;
	int err;

	if (!sunname->sun_path[0]) {
		struct dentry *dentry;

		other = unix_find_socket_byname(sunname, len, type, hash);
		if (other == NULL) {
			err = -ECONNREFUSED;
			goto fail;
		}

		dentry = unix_sk(other)->dentry;
		if (dentry)
			touch_atime(unix_sk(other)->mnt, dentry);
		return other;
	}

	err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
	if (err)
		goto fail;

	/* The caller needs write permission on the socket inode. */
	err = vfs_permission(&nd, MAY_WRITE);
	if (err)
		goto put_fail;

	err = -ECONNREFUSED;
	if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
		goto put_fail;

	other = unix_find_socket_byinode(nd.dentry->d_inode);
	if (other == NULL)
		goto put_fail;

	if (other->sk_type != type) {
		path_release(&nd);
		sock_put(other);
		err = -EPROTOTYPE;
		goto fail;
	}

	touch_atime(nd.mnt, nd.dentry);
	path_release(&nd);
	return other;

put_fail:
	path_release(&nd);
fail:
	*error = err;
	return NULL;
}
752
753
754 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
755 {
756         struct sock *sk = sock->sk;
757         struct unix_sock *u = unix_sk(sk);
758         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
759         struct dentry * dentry = NULL;
760         struct nameidata nd;
761         int err;
762         unsigned hash;
763         struct unix_address *addr;
764         struct hlist_head *list;
765
766         err = -EINVAL;
767         if (sunaddr->sun_family != AF_UNIX)
768                 goto out;
769
770         if (addr_len==sizeof(short)) {
771                 err = unix_autobind(sock);
772                 goto out;
773         }
774
775         err = unix_mkname(sunaddr, addr_len, &hash);
776         if (err < 0)
777                 goto out;
778         addr_len = err;
779
780         mutex_lock(&u->readlock);
781
782         err = -EINVAL;
783         if (u->addr)
784                 goto out_up;
785
786         err = -ENOMEM;
787         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
788         if (!addr)
789                 goto out_up;
790
791         memcpy(addr->name, sunaddr, addr_len);
792         addr->len = addr_len;
793         addr->hash = hash ^ sk->sk_type;
794         atomic_set(&addr->refcnt, 1);
795
796         if (sunaddr->sun_path[0]) {
797                 unsigned int mode;
798                 err = 0;
799                 /*
800                  * Get the parent directory, calculate the hash for last
801                  * component.
802                  */
803                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
804                 if (err)
805                         goto out_mknod_parent;
806
807                 dentry = lookup_create(&nd, 0);
808                 err = PTR_ERR(dentry);
809                 if (IS_ERR(dentry))
810                         goto out_mknod_unlock;
811
812                 /*
813                  * All right, let's create it.
814                  */
815                 mode = S_IFSOCK |
816                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
817                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
818                 if (err)
819                         goto out_mknod_dput;
820                 mutex_unlock(&nd.dentry->d_inode->i_mutex);
821                 dput(nd.dentry);
822                 nd.dentry = dentry;
823
824                 addr->hash = UNIX_HASH_SIZE;
825         }
826
827         spin_lock(&unix_table_lock);
828
829         if (!sunaddr->sun_path[0]) {
830                 err = -EADDRINUSE;
831                 if (__unix_find_socket_byname(sunaddr, addr_len,
832                                               sk->sk_type, hash)) {
833                         unix_release_addr(addr);
834                         goto out_unlock;
835                 }
836
837                 list = &unix_socket_table[addr->hash];
838         } else {
839                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
840                 u->dentry = nd.dentry;
841                 u->mnt    = nd.mnt;
842         }
843
844         err = 0;
845         __unix_remove_socket(sk);
846         u->addr = addr;
847         __unix_insert_socket(list, sk);
848
849 out_unlock:
850         spin_unlock(&unix_table_lock);
851 out_up:
852         mutex_unlock(&u->readlock);
853 out:
854         return err;
855
856 out_mknod_dput:
857         dput(dentry);
858 out_mknod_unlock:
859         mutex_unlock(&nd.dentry->d_inode->i_mutex);
860         path_release(&nd);
861 out_mknod_parent:
862         if (err==-EEXIST)
863                 err=-EADDRINUSE;
864         unix_release_addr(addr);
865         goto out_up;
866 }
867
/*
 * Connect a datagram-style AF_UNIX socket to a peer, or drop the current
 * peer association when addr->sa_family is AF_UNSPEC.
 *
 * Returns 0 on success or a negative errno (the value left in err by the
 * failing step).
 */
868 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
869                               int alen, int flags)
870 {
871         struct sock *sk = sock->sk;
872         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
873         struct sock *other;
874         unsigned hash;
875         int err;
876
877         if (addr->sa_family != AF_UNSPEC) {
878                 err = unix_mkname(sunaddr, alen, &hash);
879                 if (err < 0)
880                         goto out;
                    /* a non-negative result from unix_mkname() becomes the address length */
881                 alen = err;
882
                    /* autobind only if SOCK_PASSCRED is set and we have no address yet */
883                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
884                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
885                         goto out;
886
                    /* on failure unix_find_other() is handed &err; presumably it fills it in */
887                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
888                 if (!other)
889                         goto out;
890
891                 unix_state_wlock(sk);
892
893                 err = -EPERM;
894                 if (!unix_may_send(sk, other))
895                         goto out_unlock;
896
897                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
898                 if (err)
899                         goto out_unlock;
900
901         } else {
902                 /*
903                  *      1003.1g breaking connected state with AF_UNSPEC
904                  */
905                 other = NULL;
906                 unix_state_wlock(sk);
907         }
908
909         /*
910          * If it was connected, reconnect.
911          */
912         if (unix_peer(sk)) {
913                 struct sock *old_peer = unix_peer(sk);
914                 unix_peer(sk)=other;
915                 unix_state_wunlock(sk);
916
                    /* notify about the disconnect only when the peer really changed */
917                 if (other != old_peer)
918                         unix_dgram_disconnected(sk, old_peer);
                    /* drop the reference that was held through the old peer pointer */
919                 sock_put(old_peer);
920         } else {
921                 unix_peer(sk)=other;
922                 unix_state_wunlock(sk);
923         }
924         return 0;
925
        /* reached only from the non-AF_UNSPEC branch, so other is non-NULL here */
926 out_unlock:
927         unix_state_wunlock(sk);
928         sock_put(other);
929 out:
930         return err;
931 }
932
933 static long unix_wait_for_peer(struct sock *other, long timeo)
934 {
935         struct unix_sock *u = unix_sk(other);
936         int sched;
937         DEFINE_WAIT(wait);
938
939         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
940
941         sched = !sock_flag(other, SOCK_DEAD) &&
942                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
943                 (skb_queue_len(&other->sk_receive_queue) >
944                  other->sk_max_ack_backlog);
945
946         unix_state_runlock(other);
947
948         if (sched)
949                 timeo = schedule_timeout(timeo);
950
951         finish_wait(&u->peer_wait, &wait);
952         return timeo;
953 }
954
/*
 * Connect a stream-style AF_UNIX socket to a listening socket.
 *
 * @sock:     socket being connected
 * @uaddr:    address of the listener, interpreted as struct sockaddr_un
 * @addr_len: length of @uaddr as supplied by the caller
 * @flags:    file flags; (flags & O_NONBLOCK) is passed to sock_sndtimeo()
 *
 * The sock that will become the accepted end (newsk) and a one-byte skb
 * are allocated before any state lock is taken.  On success newsk is
 * cross-linked with sk and the skb is queued on the listener's receive
 * queue, from where unix_accept() picks it up.
 *
 * Returns 0 on success or a negative errno:
 *   -ENOMEM        newsk or the skb could not be allocated
 *   -ECONNREFUSED  the target is not listening, or has RCV_SHUTDOWN set
 *   -EAGAIN        the listener's backlog is full and the timeout is zero
 *   -EISCONN       sk is already TCP_ESTABLISHED
 *   -EINVAL        sk is in any other state than TCP_CLOSE
 * plus whatever unix_mkname(), unix_autobind(), unix_find_other(),
 * sock_intr_errno() or security_unix_stream_connect() report.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	/* a non-negative result from unix_mkname() becomes the address length */
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we did it after the state is locked,
	   we would have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_rlock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	/*
	 * A listener with RCV_SHUTDOWN set will never drain its backlog, and
	 * unix_wait_for_peer() refuses to sleep on such a socket.  Without
	 * this check a full backlog would make the loop below restart
	 * forever without ever sleeping.
	 */
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* unix_wait_for_peer() drops the read lock on other */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab write lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock_nested(sk);

	/* our state changed while we were not holding the lock: start over */
	if (sk->sk_state != st) {
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	newu = unix_sk(newsk);
	newsk->sk_sleep		= &newu->peer_wait;
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->dentry) {
		newu->dentry	= dget(otheru->dentry);
		newu->mnt	= mntget(otheru->mnt);
	}

	/* Set credentials */
	sk->sk_peercred = other->sk_peercred;

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_wunlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/* Undo artificially decreased inflight after embryo
	 * is installed to listening socket. */
	atomic_inc(&newu->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	/* release whatever was set up before the failure */
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1133
1134 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1135 {
1136         struct sock *ska=socka->sk, *skb = sockb->sk;
1137
1138         /* Join our sockets back to back */
1139         sock_hold(ska);
1140         sock_hold(skb);
1141         unix_peer(ska)=skb;
1142         unix_peer(skb)=ska;
1143         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1144         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1145         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1146
1147         if (ska->sk_type != SOCK_DGRAM) {
1148                 ska->sk_state = TCP_ESTABLISHED;
1149                 skb->sk_state = TCP_ESTABLISHED;
1150                 socka->state  = SS_CONNECTED;
1151                 sockb->state  = SS_CONNECTED;
1152         }
1153         return 0;
1154 }
1155
1156 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1157 {
1158         struct sock *sk = sock->sk;
1159         struct sock *tsk;
1160         struct sk_buff *skb;
1161         int err;
1162
1163         err = -EOPNOTSUPP;
1164         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1165                 goto out;
1166
1167         err = -EINVAL;
1168         if (sk->sk_state != TCP_LISTEN)
1169                 goto out;
1170
1171         /* If socket state is TCP_LISTEN it cannot change (for now...),
1172          * so that no locks are necessary.
1173          */
1174
1175         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1176         if (!skb) {
1177                 /* This means receive shutdown. */
1178                 if (err == 0)
1179                         err = -EINVAL;
1180                 goto out;
1181         }
1182
1183         tsk = skb->sk;
1184         skb_free_datagram(sk, skb);
1185         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1186
1187         /* attach accepted sock to socket */
1188         unix_state_wlock(tsk);
1189         newsock->state = SS_CONNECTED;
1190         sock_graft(tsk, newsock);
1191         unix_state_wunlock(tsk);
1192         return 0;
1193
1194 out:
1195         return err;
1196 }
1197
1198
1199 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1200 {
1201         struct sock *sk = sock->sk;
1202         struct unix_sock *u;
1203         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1204         int err = 0;
1205
1206         if (peer) {
1207                 sk = unix_peer_get(sk);
1208
1209                 err = -ENOTCONN;
1210                 if (!sk)
1211                         goto out;
1212                 err = 0;
1213         } else {
1214                 sock_hold(sk);
1215         }
1216
1217         u = unix_sk(sk);
1218         unix_state_rlock(sk);
1219         if (!u->addr) {
1220                 sunaddr->sun_family = AF_UNIX;
1221                 sunaddr->sun_path[0] = 0;
1222                 *uaddr_len = sizeof(short);
1223         } else {
1224                 struct unix_address *addr = u->addr;
1225
1226                 *uaddr_len = addr->len;
1227                 memcpy(sunaddr, addr->name, *uaddr_len);
1228         }
1229         unix_state_runlock(sk);
1230         sock_put(sk);
1231 out:
1232         return err;
1233 }
1234
1235 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1236 {
1237         int i;
1238
1239         scm->fp = UNIXCB(skb).fp;
1240         skb->destructor = sock_wfree;
1241         UNIXCB(skb).fp = NULL;
1242
1243         for (i=scm->fp->count-1; i>=0; i--)
1244                 unix_notinflight(scm->fp->fp[i]);
1245 }
1246
1247 static void unix_destruct_fds(struct sk_buff *skb)
1248 {
1249         struct scm_cookie scm;
1250         memset(&scm, 0, sizeof(scm));
1251         unix_detach_fds(&scm, skb);
1252
1253         /* Alas, it calls VFS */
1254         /* So fscking what? fput() had been SMP-safe since the last Summer */
1255         scm_destroy(&scm);
1256         sock_wfree(skb);
1257 }
1258
1259 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1260 {
1261         int i;
1262         for (i=scm->fp->count-1; i>=0; i--)
1263                 unix_inflight(scm->fp->fp[i]);
1264         UNIXCB(skb).fp = scm->fp;
1265         skb->destructor = unix_destruct_fds;
1266         scm->fp = NULL;
1267 }
1268
1269 /*
1270  *      Send AF_UNIX data.
1271  */
1272
/*
 * Send one datagram on an AF_UNIX socket.
 *
 * The destination is msg->msg_name when msg_namelen is non-zero, otherwise
 * the connected peer.  The whole message is copied into a single skb which
 * is queued on the receiver's sk_receive_queue.
 *
 * Returns len on success or a negative errno.
 */
1273 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1274                               struct msghdr *msg, size_t len)
1275 {
1276         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1277         struct sock *sk = sock->sk;
1278         struct unix_sock *u = unix_sk(sk);
1279         struct sockaddr_un *sunaddr=msg->msg_name;
1280         struct sock *other = NULL;
1281         int namelen = 0; /* fake GCC */
1282         int err;
1283         unsigned hash;
1284         struct sk_buff *skb;
1285         long timeo;
1286         struct scm_cookie tmp_scm;
1287
             /* fall back to an on-stack cookie when the caller supplied none */
1288         if (NULL == siocb->scm)
1289                 siocb->scm = &tmp_scm;
1290         err = scm_send(sock, msg, siocb->scm);
1291         if (err < 0)
1292                 return err;
1293
1294         err = -EOPNOTSUPP;
1295         if (msg->msg_flags&MSG_OOB)
1296                 goto out;
1297
1298         if (msg->msg_namelen) {
1299                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1300                 if (err < 0)
1301                         goto out;
1302                 namelen = err;
1303         } else {
                     /* no explicit destination: use the connected peer (takes a reference) */
1304                 sunaddr = NULL;
1305                 err = -ENOTCONN;
1306                 other = unix_peer_get(sk);
1307                 if (!other)
1308                         goto out;
1309         }
1310
1311         if (test_bit(SOCK_PASSCRED, &sock->flags)
1312                 && !u->addr && (err = unix_autobind(sock)) != 0)
1313                 goto out;
1314
1315         err = -EMSGSIZE;
1316         if (len > sk->sk_sndbuf - 32)
1317                 goto out;
1318
1319         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1320         if (skb==NULL)
1321                 goto out;
1322
1323         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1324         if (siocb->scm->fp)
1325                 unix_attach_fds(siocb->scm, skb);
1326
1327         unix_get_peersec_dgram(skb);
1328
1329         skb->h.raw = skb->data;
1330         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1331         if (err)
1332                 goto out_free;
1333
1334         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1335
1336 restart:
             /* other is NULL on first pass with an explicit address, or after a dead peer was dropped */
1337         if (!other) {
1338                 err = -ECONNRESET;
1339                 if (sunaddr == NULL)
1340                         goto out_free;
1341
1342                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1343                                         hash, &err);
1344                 if (other==NULL)
1345                         goto out_free;
1346         }
1347
1348         unix_state_rlock(other);
1349         err = -EPERM;
1350         if (!unix_may_send(sk, other))
1351                 goto out_unlock;
1352
1353         if (sock_flag(other, SOCK_DEAD)) {
1354                 /*
1355                  *      Check with 1003.1g - what should
1356                  *      datagram error
1357                  */
1358                 unix_state_runlock(other);
1359                 sock_put(other);
1360
1361                 err = 0;
1362                 unix_state_wlock(sk);
                     /* the dead sock was our connected peer: disconnect and report an error */
1363                 if (unix_peer(sk) == other) {
1364                         unix_peer(sk)=NULL;
1365                         unix_state_wunlock(sk);
1366
1367                         unix_dgram_disconnected(sk, other);
                             /* drop the reference that was held through the peer pointer */
1368                         sock_put(other);
1369                         err = -ECONNREFUSED;
1370                 } else {
1371                         unix_state_wunlock(sk);
1372                 }
1373
1374                 other = NULL;
1375                 if (err)
1376                         goto out_free;
                     /* not our peer: look the address up again */
1377                 goto restart;
1378         }
1379
1380         err = -EPIPE;
1381         if (other->sk_shutdown & RCV_SHUTDOWN)
1382                 goto out_unlock;
1383
1384         if (sk->sk_type != SOCK_SEQPACKET) {
1385                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1386                 if (err)
1387                         goto out_unlock;
1388         }
1389
             /* the backlog limit is not applied when the receiver is connected back to us */
1390         if (unix_peer(other) != sk &&
1391             (skb_queue_len(&other->sk_receive_queue) >
1392              other->sk_max_ack_backlog)) {
1393                 if (!timeo) {
1394                         err = -EAGAIN;
1395                         goto out_unlock;
1396                 }
1397
                     /* unix_wait_for_peer() releases the read lock on other */
1398                 timeo = unix_wait_for_peer(other, timeo);
1399
1400                 err = sock_intr_errno(timeo);
1401                 if (signal_pending(current))
1402                         goto out_free;
1403
1404                 goto restart;
1405         }
1406
1407         skb_queue_tail(&other->sk_receive_queue, skb);
1408         unix_state_runlock(other);
1409         other->sk_data_ready(other, len);
1410         sock_put(other);
1411         scm_destroy(siocb->scm);
1412         return len;
1413
1414 out_unlock:
1415         unix_state_runlock(other);
1416 out_free:
1417         kfree_skb(skb);
1418 out:
1419         if (other)
1420                 sock_put(other);
1421         scm_destroy(siocb->scm);
1422         return err;
1423 }
1424
1425                 
/*
 * Send data on a connected stream-style AF_UNIX socket.
 *
 * The payload is split into skbs of bounded size, each queued on the
 * peer's sk_receive_queue.  A destination address is not accepted.
 *
 * Returns the number of bytes queued when that is non-zero, otherwise a
 * negative errno.  SIGPIPE is raised on a broken pipe when nothing was
 * sent and MSG_NOSIGNAL is not set.
 */
1426 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1427                                struct msghdr *msg, size_t len)
1428 {
1429         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1430         struct sock *sk = sock->sk;
1431         struct sock *other = NULL;
1432         struct sockaddr_un *sunaddr=msg->msg_name;
1433         int err,size;
1434         struct sk_buff *skb;
1435         int sent=0;
1436         struct scm_cookie tmp_scm;
1437
             /* fall back to an on-stack cookie when the caller supplied none */
1438         if (NULL == siocb->scm)
1439                 siocb->scm = &tmp_scm;
1440         err = scm_send(sock, msg, siocb->scm);
1441         if (err < 0)
1442                 return err;
1443
1444         err = -EOPNOTSUPP;
1445         if (msg->msg_flags&MSG_OOB)
1446                 goto out_err;
1447
1448         if (msg->msg_namelen) {
                     /* an explicit destination is always an error on this path */
1449                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1450                 goto out_err;
1451         } else {
1452                 sunaddr = NULL;
1453                 err = -ENOTCONN;
                     /* NOTE(review): peer pointer is read without taking a reference here */
1454                 other = unix_peer(sk);
1455                 if (!other)
1456                         goto out_err;
1457         }
1458
1459         if (sk->sk_shutdown & SEND_SHUTDOWN)
1460                 goto pipe_err;
1461
1462         while(sent < len)
1463         {
1464                 /*
1465                  *      Optimisation for the fact that under 0.01% of X
1466                  *      messages typically need breaking up.
1467                  */
1468
1469                 size = len-sent;
1470
1471                 /* Keep two messages in the pipe so it schedules better */
1472                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1473                         size = (sk->sk_sndbuf >> 1) - 64;
1474
1475                 if (size > SKB_MAX_ALLOC)
1476                         size = SKB_MAX_ALLOC;
1477                         
1478                 /*
1479                  *      Grab a buffer
1480                  */
1481                  
1482                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1483
1484                 if (skb==NULL)
1485                         goto out_err;
1486
1487                 /*
1488                  *      If you pass two values to the sock_alloc_send_skb
1489                  *      it tries to grab the large buffer with GFP_NOFS
1490                  *      (which can fail easily), and if it fails grab the
1491                  *      fallback size buffer which is under a page and will
1492                  *      succeed. [Alan]
1493                  */
1494                 size = min_t(int, size, skb_tailroom(skb));
1495
1496                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
                     /* unix_attach_fds() clears scm->fp, so only the first skb carries the files */
1497                 if (siocb->scm->fp)
1498                         unix_attach_fds(siocb->scm, skb);
1499
1500                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1501                         kfree_skb(skb);
1502                         goto out_err;
1503                 }
1504
1505                 unix_state_rlock(other);
1506
1507                 if (sock_flag(other, SOCK_DEAD) ||
1508                     (other->sk_shutdown & RCV_SHUTDOWN))
1509                         goto pipe_err_free;
1510
1511                 skb_queue_tail(&other->sk_receive_queue, skb);
1512                 unix_state_runlock(other);
1513                 other->sk_data_ready(other, size);
1514                 sent+=size;
1515         }
1516
1517         scm_destroy(siocb->scm);
1518         siocb->scm = NULL;
1519
1520         return sent;
1521
1522 pipe_err_free:
1523         unix_state_runlock(other);
1524         kfree_skb(skb);
1525 pipe_err:
1526         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1527                 send_sig(SIGPIPE,current,0);
1528         err = -EPIPE;
1529 out_err:
1530         scm_destroy(siocb->scm);
1531         siocb->scm = NULL;
             /* a partial send wins over the error code */
1532         return sent ? : err;
1533 }
1534
1535 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1536                                   struct msghdr *msg, size_t len)
1537 {
1538         int err;
1539         struct sock *sk = sock->sk;
1540         
1541         err = sock_error(sk);
1542         if (err)
1543                 return err;
1544
1545         if (sk->sk_state != TCP_ESTABLISHED)
1546                 return -ENOTCONN;
1547
1548         if (msg->msg_namelen)
1549                 msg->msg_namelen = 0;
1550
1551         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1552 }
1553                                                                                             
1554 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1555 {
1556         struct unix_sock *u = unix_sk(sk);
1557
1558         msg->msg_namelen = 0;
1559         if (u->addr) {
1560                 msg->msg_namelen = u->addr->len;
1561                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1562         }
1563 }
1564
/*
 * Receive one datagram from an AF_UNIX socket.
 *
 * Runs under u->readlock.  At most size bytes are copied; MSG_TRUNC is set
 * in msg->msg_flags when the datagram was longer.  Sender credentials and
 * any passed files are delivered through siocb->scm.
 *
 * Returns the number of bytes copied or a negative errno.
 */
1565 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1566                               struct msghdr *msg, size_t size,
1567                               int flags)
1568 {
1569         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1570         struct scm_cookie tmp_scm;
1571         struct sock *sk = sock->sk;
1572         struct unix_sock *u = unix_sk(sk);
1573         int noblock = flags & MSG_DONTWAIT;
1574         struct sk_buff *skb;
1575         int err;
1576
1577         err = -EOPNOTSUPP;
1578         if (flags&MSG_OOB)
1579                 goto out;
1580
1581         msg->msg_namelen = 0;
1582
1583         mutex_lock(&u->readlock);
1584
1585         skb = skb_recv_datagram(sk, flags, noblock, &err);
1586         if (!skb)
1587                 goto out_unlock;
1588
             /* wake a waiter on our peer_wait queue (senders sleep there in unix_wait_for_peer()) */
1589         wake_up_interruptible(&u->peer_wait);
1590
             /* report the sender's address when the caller asked for one */
1591         if (msg->msg_name)
1592                 unix_copy_addr(msg, skb->sk);
1593
1594         if (size > skb->len)
1595                 size = skb->len;
1596         else if (size < skb->len)
1597                 msg->msg_flags |= MSG_TRUNC;
1598
1599         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1600         if (err)
1601                 goto out_free;
1602
             /* fall back to a zeroed on-stack cookie when the caller supplied none */
1603         if (!siocb->scm) {
1604                 siocb->scm = &tmp_scm;
1605                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1606         }
1607         siocb->scm->creds = *UNIXCREDS(skb);
1608         unix_set_secdata(siocb->scm, skb);
1609
1610         if (!(flags & MSG_PEEK))
1611         {
1612                 if (UNIXCB(skb).fp)
1613                         unix_detach_fds(siocb->scm, skb);
1614         }
1615         else 
1616         {
1617                 /* It is questionable: on PEEK we could:
1618                    - do not return fds - good, but too simple 8)
1619                    - return fds, and do not return them on read (old strategy,
1620                      apparently wrong)
1621                    - clone fds (I chose it for now, it is the most universal
1622                      solution)
1623
1624                    POSIX 1003.1g does not actually define this clearly
1625                    at all. POSIX 1003.1g doesn't define a lot of things
1626                    clearly however!
1627
1628                 */
1629                 if (UNIXCB(skb).fp)
1630                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1631         }
1632         err = size;
1633
1634         scm_recv(sock, msg, siocb->scm, flags);
1635
             /* the success path falls through the cleanup labels as well */
1636 out_free:
1637         skb_free_datagram(sk,skb);
1638 out_unlock:
1639         mutex_unlock(&u->readlock);
1640 out:
1641         return err;
1642 }
1643
1644 /*
1645  *      Sleep until data has arrived. But check for races..
1646  */
1647  
/*
 * Block until the socket has something to report or timeo runs out.
 *
 * The loop ends when the receive queue is non-empty, sk_err is set,
 * RCV_SHUTDOWN is set, a signal is pending, or timeo has reached zero.
 * The state read lock is held while the conditions are checked and is
 * dropped around schedule_timeout().
 *
 * Returns the remaining timeout.
 */
1648 static long unix_stream_data_wait(struct sock * sk, long timeo)
1649 {
1650         DEFINE_WAIT(wait);
1651
1652         unix_state_rlock(sk);
1653
1654         for (;;) {
                     /* queue ourselves before testing the conditions */
1655                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1656
1657                 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1658                     sk->sk_err ||
1659                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1660                     signal_pending(current) ||
1661                     !timeo)
1662                         break;
1663
                     /* SOCK_ASYNC_WAITDATA is set only for the duration of the sleep */
1664                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1665                 unix_state_runlock(sk);
1666                 timeo = schedule_timeout(timeo);
1667                 unix_state_rlock(sk);
1668                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1669         }
1670
1671         finish_wait(sk->sk_sleep, &wait);
1672         unix_state_runlock(sk);
1673         return timeo;
1674 }
1675
1676
1677
/*
 * Receive data from a connected stream-style AF_UNIX socket.
 *
 * skbs are taken off sk_receive_queue one at a time under u->readlock and
 * copied to the caller until size is exhausted or one of the break
 * conditions in the loop is hit.  Data from writers with different
 * credentials is never merged into one read, and a read stops after an
 * skb that carried passed files.
 *
 * Returns the number of bytes copied when that is non-zero (this includes
 * -EFAULT stored in copied when the very first copy fails), otherwise err.
 */
1678 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1679                                struct msghdr *msg, size_t size,
1680                                int flags)
1681 {
1682         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1683         struct scm_cookie tmp_scm;
1684         struct sock *sk = sock->sk;
1685         struct unix_sock *u = unix_sk(sk);
1686         struct sockaddr_un *sunaddr=msg->msg_name;
1687         int copied = 0;
1688         int check_creds = 0;
1689         int target;
1690         int err = 0;
1691         long timeo;
1692
1693         err = -EINVAL;
1694         if (sk->sk_state != TCP_ESTABLISHED)
1695                 goto out;
1696
1697         err = -EOPNOTSUPP;
1698         if (flags&MSG_OOB)
1699                 goto out;
1700
             /* target: byte count below which the loop keeps waiting for more data */
1701         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1702         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1703
1704         msg->msg_namelen = 0;
1705
1706         /* Lock the socket to prevent queue disordering
1707          * while sleeps in memcpy_tomsg
1708          */
1709
             /* fall back to a zeroed on-stack cookie when the caller supplied none */
1710         if (!siocb->scm) {
1711                 siocb->scm = &tmp_scm;
1712                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1713         }
1714
1715         mutex_lock(&u->readlock);
1716
1717         do
1718         {
1719                 int chunk;
1720                 struct sk_buff *skb;
1721
1722                 skb = skb_dequeue(&sk->sk_receive_queue);
1723                 if (skb==NULL)
1724                 {
1725                         if (copied >= target)
1726                                 break;
1727
1728                         /*
1729                          *      POSIX 1003.1g mandates this order.
1730                          */
1731                          
1732                         if ((err = sock_error(sk)) != 0)
1733                                 break;
1734                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1735                                 break;
1736                         err = -EAGAIN;
1737                         if (!timeo)
1738                                 break;
                             /* do not hold readlock while sleeping */
1739                         mutex_unlock(&u->readlock);
1740
1741                         timeo = unix_stream_data_wait(sk, timeo);
1742
                             /* readlock is not held here, so leave via out, skipping the unlock */
1743                         if (signal_pending(current)) {
1744                                 err = sock_intr_errno(timeo);
1745                                 goto out;
1746                         }
1747                         mutex_lock(&u->readlock);
1748                         continue;
1749                 }
1750
1751                 if (check_creds) {
1752                         /* Never glue messages from different writers */
1753                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1754                                 skb_queue_head(&sk->sk_receive_queue, skb);
1755                                 break;
1756                         }
1757                 } else {
1758                         /* Copy credentials */
1759                         siocb->scm->creds = *UNIXCREDS(skb);
1760                         check_creds = 1;
1761                 }
1762
1763                 /* Copy address just once */
1764                 if (sunaddr)
1765                 {
1766                         unix_copy_addr(msg, skb->sk);
1767                         sunaddr = NULL;
1768                 }
1769
1770                 chunk = min_t(unsigned int, skb->len, size);
1771                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                             /* copy failed: requeue the skb untouched */
1772                         skb_queue_head(&sk->sk_receive_queue, skb);
1773                         if (copied == 0)
1774                                 copied = -EFAULT;
1775                         break;
1776                 }
1777                 copied += chunk;
1778                 size -= chunk;
1779
1780                 /* Mark read part of skb as used */
1781                 if (!(flags & MSG_PEEK))
1782                 {
1783                         skb_pull(skb, chunk);
1784
1785                         if (UNIXCB(skb).fp)
1786                                 unix_detach_fds(siocb->scm, skb);
1787
1788                         /* put the skb back if we didn't use it up.. */
1789                         if (skb->len)
1790                         {
1791                                 skb_queue_head(&sk->sk_receive_queue, skb);
1792                                 break;
1793                         }
1794
1795                         kfree_skb(skb);
1796
                             /* stop once files have been collected into the cookie */
1797                         if (siocb->scm->fp)
1798                                 break;
1799                 }
1800                 else
1801                 {
1802                         /* It is questionable, see note in unix_dgram_recvmsg.
1803                          */
1804                         if (UNIXCB(skb).fp)
1805                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1806
1807                         /* put message back and return */
1808                         skb_queue_head(&sk->sk_receive_queue, skb);
1809                         break;
1810                 }
1811         } while (size);
1812
1813         mutex_unlock(&u->readlock);
1814         scm_recv(sock, msg, siocb->scm, flags);
1815 out:
             /* a non-zero copied value wins over err */
1816         return copied ? : err;
1817 }
1818
1819 static int unix_shutdown(struct socket *sock, int mode)
1820 {
1821         struct sock *sk = sock->sk;
1822         struct sock *other;
1823
1824         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1825
1826         if (mode) {
1827                 unix_state_wlock(sk);
1828                 sk->sk_shutdown |= mode;
1829                 other=unix_peer(sk);
1830                 if (other)
1831                         sock_hold(other);
1832                 unix_state_wunlock(sk);
1833                 sk->sk_state_change(sk);
1834
1835                 if (other &&
1836                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1837
1838                         int peer_mode = 0;
1839
1840                         if (mode&RCV_SHUTDOWN)
1841                                 peer_mode |= SEND_SHUTDOWN;
1842                         if (mode&SEND_SHUTDOWN)
1843                                 peer_mode |= RCV_SHUTDOWN;
1844                         unix_state_wlock(other);
1845                         other->sk_shutdown |= peer_mode;
1846                         unix_state_wunlock(other);
1847                         other->sk_state_change(other);
1848                         read_lock(&other->sk_callback_lock);
1849                         if (peer_mode == SHUTDOWN_MASK)
1850                                 sk_wake_async(other,1,POLL_HUP);
1851                         else if (peer_mode & RCV_SHUTDOWN)
1852                                 sk_wake_async(other,1,POLL_IN);
1853                         read_unlock(&other->sk_callback_lock);
1854                 }
1855                 if (other)
1856                         sock_put(other);
1857         }
1858         return 0;
1859 }
1860
1861 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1862 {
1863         struct sock *sk = sock->sk;
1864         long amount=0;
1865         int err;
1866
1867         switch(cmd)
1868         {
1869                 case SIOCOUTQ:
1870                         amount = atomic_read(&sk->sk_wmem_alloc);
1871                         err = put_user(amount, (int __user *)arg);
1872                         break;
1873                 case SIOCINQ:
1874                 {
1875                         struct sk_buff *skb;
1876
1877                         if (sk->sk_state == TCP_LISTEN) {
1878                                 err = -EINVAL;
1879                                 break;
1880                         }
1881
1882                         spin_lock(&sk->sk_receive_queue.lock);
1883                         if (sk->sk_type == SOCK_STREAM ||
1884                             sk->sk_type == SOCK_SEQPACKET) {
1885                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1886                                         amount += skb->len;
1887                         } else {
1888                                 skb = skb_peek(&sk->sk_receive_queue);
1889                                 if (skb)
1890                                         amount=skb->len;
1891                         }
1892                         spin_unlock(&sk->sk_receive_queue.lock);
1893                         err = put_user(amount, (int __user *)arg);
1894                         break;
1895                 }
1896
1897                 default:
1898                         err = -ENOIOCTLCMD;
1899                         break;
1900         }
1901         return err;
1902 }
1903
1904 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1905 {
1906         struct sock *sk = sock->sk;
1907         unsigned int mask;
1908
1909         poll_wait(file, sk->sk_sleep, wait);
1910         mask = 0;
1911
1912         /* exceptional events? */
1913         if (sk->sk_err)
1914                 mask |= POLLERR;
1915         if (sk->sk_shutdown == SHUTDOWN_MASK)
1916                 mask |= POLLHUP;
1917         if (sk->sk_shutdown & RCV_SHUTDOWN)
1918                 mask |= POLLRDHUP;
1919
1920         /* readable? */
1921         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1922             (sk->sk_shutdown & RCV_SHUTDOWN))
1923                 mask |= POLLIN | POLLRDNORM;
1924
1925         /* Connection-based need to check for termination and startup */
1926         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1927                 mask |= POLLHUP;
1928
1929         /*
1930          * we set writable also when the other side has shut down the
1931          * connection. This prevents stuck sockets.
1932          */
1933         if (unix_writable(sk))
1934                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1935
1936         return mask;
1937 }
1938
1939
1940 #ifdef CONFIG_PROC_FS
1941 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1942 {
1943         loff_t off = 0;
1944         struct sock *s;
1945
1946         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1947                 if (off == pos) 
1948                         return s;
1949                 ++off;
1950         }
1951         return NULL;
1952 }
1953
1954
1955 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1956 {
1957         spin_lock(&unix_table_lock);
1958         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1959 }
1960
1961 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1962 {
1963         ++*pos;
1964
1965         if (v == (void *)1) 
1966                 return first_unix_socket(seq->private);
1967         return next_unix_socket(seq->private, v);
1968 }
1969
1970 static void unix_seq_stop(struct seq_file *seq, void *v)
1971 {
1972         spin_unlock(&unix_table_lock);
1973 }
1974
1975 static int unix_seq_show(struct seq_file *seq, void *v)
1976 {
1977         
1978         if (v == (void *)1)
1979                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1980                          "Inode Path\n");
1981         else {
1982                 struct sock *s = v;
1983                 struct unix_sock *u = unix_sk(s);
1984                 unix_state_rlock(s);
1985
1986                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1987                         s,
1988                         atomic_read(&s->sk_refcnt),
1989                         0,
1990                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1991                         s->sk_type,
1992                         s->sk_socket ?
1993                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1994                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1995                         sock_i_ino(s));
1996
1997                 if (u->addr) {
1998                         int i, len;
1999                         seq_putc(seq, ' ');
2000
2001                         i = 0;
2002                         len = u->addr->len - sizeof(short);
2003                         if (!UNIX_ABSTRACT(s))
2004                                 len--;
2005                         else {
2006                                 seq_putc(seq, '@');
2007                                 i++;
2008                         }
2009                         for ( ; i < len; i++)
2010                                 seq_putc(seq, u->addr->name->sun_path[i]);
2011                 }
2012                 unix_state_runlock(s);
2013                 seq_putc(seq, '\n');
2014         }
2015
2016         return 0;
2017 }
2018
2019 static struct seq_operations unix_seq_ops = {
2020         .start  = unix_seq_start,
2021         .next   = unix_seq_next,
2022         .stop   = unix_seq_stop,
2023         .show   = unix_seq_show,
2024 };
2025
2026
2027 static int unix_seq_open(struct inode *inode, struct file *file)
2028 {
2029         struct seq_file *seq;
2030         int rc = -ENOMEM;
2031         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2032
2033         if (!iter)
2034                 goto out;
2035
2036         rc = seq_open(file, &unix_seq_ops);
2037         if (rc)
2038                 goto out_kfree;
2039
2040         seq          = file->private_data;
2041         seq->private = iter;
2042         *iter = 0;
2043 out:
2044         return rc;
2045 out_kfree:
2046         kfree(iter);
2047         goto out;
2048 }
2049
2050 static struct file_operations unix_seq_fops = {
2051         .owner          = THIS_MODULE,
2052         .open           = unix_seq_open,
2053         .read           = seq_read,
2054         .llseek         = seq_lseek,
2055         .release        = seq_release_private,
2056 };
2057
2058 #endif
2059
2060 static struct net_proto_family unix_family_ops = {
2061         .family = PF_UNIX,
2062         .create = unix_create,
2063         .owner  = THIS_MODULE,
2064 };
2065
2066 static int __init af_unix_init(void)
2067 {
2068         int rc = -1;
2069         struct sk_buff *dummy_skb;
2070
2071         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2072                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2073                 goto out;
2074         }
2075
2076         rc = proto_register(&unix_proto, 1);
2077         if (rc != 0) {
2078                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2079                        __FUNCTION__);
2080                 goto out;
2081         }
2082
2083         sock_register(&unix_family_ops);
2084 #ifdef CONFIG_PROC_FS
2085         proc_net_fops_create("unix", 0, &unix_seq_fops);
2086 #endif
2087         unix_sysctl_register();
2088 out:
2089         return rc;
2090 }
2091
2092 static void __exit af_unix_exit(void)
2093 {
2094         sock_unregister(PF_UNIX);
2095         unix_sysctl_unregister();
2096         proc_net_remove("unix");
2097         proto_unregister(&unix_proto);
2098 }
2099
2100 module_init(af_unix_init);
2101 module_exit(af_unix_exit);
2102
2103 MODULE_LICENSE("GPL");
2104 MODULE_ALIAS_NETPROTO(PF_UNIX);