net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  33  *                                      by above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      is been reached. This won't break
  37  *                                      old apps and it will avoid huge amount
  38  *                                      of socks hashed (this for unix_gc()
  39  *                                      performances reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117
 118 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 119 static DEFINE_SPINLOCK(unix_table_lock);
 120 static atomic_long_t unix_nr_socks;
 121
 122 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
 123
 124 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
 125
 126 #ifdef CONFIG_SECURITY_NETWORK
 127 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 128 {
 129         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 130 }
 131
 132 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 133 {
 134         scm->secid = *UNIXSID(skb);
 135 }
 136 #else
 137 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 138 { }
 139
 140 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 141 { }
 142 #endif /* CONFIG_SECURITY_NETWORK */
 143
 144 /*
 145  *  SMP locking strategy:
 146  *    hash table is protected with spinlock unix_table_lock
 147  *    each socket state is protected by separate spin lock.
 148  */
 149
 150 static inline unsigned unix_hash_fold(__wsum n)
 151 {
 152         unsigned hash = (__force unsigned)n;
 153         hash ^= hash>>16;
 154         hash ^= hash>>8;
 155         return hash&(UNIX_HASH_SIZE-1);
 156 }
 157
 158 #define unix_peer(sk) (unix_sk(sk)->peer)
 159
 160 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 161 {
 162         return unix_peer(osk) == sk;
 163 }
 164
 165 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 166 {
 167         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 168 }
 169
 170 static inline int unix_recvq_full(struct sock const *sk)
 171 {
 172         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 173 }
 174
 175 static struct sock *unix_peer_get(struct sock *s)
 176 {
 177         struct sock *peer;
 178
 179         unix_state_lock(s);
 180         peer = unix_peer(s);
 181         if (peer)
 182                 sock_hold(peer);
 183         unix_state_unlock(s);
 184         return peer;
 185 }
 186
 187 static inline void unix_release_addr(struct unix_address *addr)
 188 {
 189         if (atomic_dec_and_test(&addr->refcnt))
 190                 kfree(addr);
 191 }
 192
 193 /*
 194  *      Check unix socket name:
 195  *              - should be not zero length.
 196  *              - if started by not zero, should be NULL terminated (FS object)
 197  *              - if started by zero, it is abstract name.
 198  */
 199
 200 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned *hashp)
 201 {
 202         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 203                 return -EINVAL;
 204         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 205                 return -EINVAL;
 206         if (sunaddr->sun_path[0]) {
 207                 /*
 208                  * This may look like an off by one error but it is a bit more
 209                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 210                  * sun_path[108] doesn't as such exist.  However in kernel space
 211                  * we are guaranteed that it is a valid memory location in our
 212                  * kernel address buffer.
 213                  */
 214                 ((char *)sunaddr)[len] = 0;
 215                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 216                 return len;
 217         }
 218
 219         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 220         return len;
 221 }
 222
 223 static void __unix_remove_socket(struct sock *sk)
 224 {
 225         sk_del_node_init(sk);
 226 }
 227
 228 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 229 {
 230         WARN_ON(!sk_unhashed(sk));
 231         sk_add_node(sk, list);
 232 }
 233
 234 static inline void unix_remove_socket(struct sock *sk)
 235 {
 236         spin_lock(&unix_table_lock);
 237         __unix_remove_socket(sk);
 238         spin_unlock(&unix_table_lock);
 239 }
 240
 241 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 242 {
 243         spin_lock(&unix_table_lock);
 244         __unix_insert_socket(list, sk);
 245         spin_unlock(&unix_table_lock);
 246 }
 247
 248 static struct sock *__unix_find_socket_byname(struct net *net,
 249                                               struct sockaddr_un *sunname,
 250                                               int len, int type, unsigned hash)
 251 {
 252         struct sock *s;
 253         struct hlist_node *node;
 254
 255         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
 256                 struct unix_sock *u = unix_sk(s);
 257
 258                 if (!net_eq(sock_net(s), net))
 259                         continue;
 260
 261                 if (u->addr->len == len &&
 262                     !memcmp(u->addr->name, sunname, len))
 263                         goto found;
 264         }
 265         s = NULL;
 266 found:
 267         return s;
 268 }
 269
 270 static inline struct sock *unix_find_socket_byname(struct net *net,
 271                                                    struct sockaddr_un *sunname,
 272                                                    int len, int type,
 273                                                    unsigned hash)
 274 {
 275         struct sock *s;
 276
 277         spin_lock(&unix_table_lock);
 278         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 279         if (s)
 280                 sock_hold(s);
 281         spin_unlock(&unix_table_lock);
 282         return s;
 283 }
 284
 285 static struct sock *unix_find_socket_byinode(struct inode *i)
 286 {
 287         struct sock *s;
 288         struct hlist_node *node;
 289
 290         spin_lock(&unix_table_lock);
 291         sk_for_each(s, node,
 292                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 293                 struct dentry *dentry = unix_sk(s)->dentry;
 294
 295                 if (dentry && dentry->d_inode == i) {
 296                         sock_hold(s);
 297                         goto found;
 298                 }
 299         }
 300         s = NULL;
 301 found:
 302         spin_unlock(&unix_table_lock);
 303         return s;
 304 }
 305
 306 static inline int unix_writable(struct sock *sk)
 307 {
 308         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 309 }
 310
 311 static void unix_write_space(struct sock *sk)
 312 {
 313         struct socket_wq *wq;
 314
 315         rcu_read_lock();
 316         if (unix_writable(sk)) {
 317                 wq = rcu_dereference(sk->sk_wq);
 318                 if (wq_has_sleeper(wq))
 319                         wake_up_interruptible_sync_poll(&wq->wait,
 320                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 321                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 322         }
 323         rcu_read_unlock();
 324 }
 325
 326 /* When dgram socket disconnects (or changes its peer), we clear its receive
 327  * queue of packets arrived from previous peer. First, it allows to do
 328  * flow control based only on wmem_alloc; second, sk connected to peer
 329  * may receive messages only from that peer. */
 330 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 331 {
 332         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 333                 skb_queue_purge(&sk->sk_receive_queue);
 334                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 335
 336                 /* If one link of bidirectional dgram pipe is disconnected,
 337                  * we signal error. Messages are lost. Do not make this,
 338                  * when peer was not connected to us.
 339                  */
 340                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 341                         other->sk_err = ECONNRESET;
 342                         other->sk_error_report(other);
 343                 }
 344         }
 345 }
 346
 347 static void unix_sock_destructor(struct sock *sk)
 348 {
 349         struct unix_sock *u = unix_sk(sk);
 350
 351         skb_queue_purge(&sk->sk_receive_queue);
 352
 353         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 354         WARN_ON(!sk_unhashed(sk));
 355         WARN_ON(sk->sk_socket);
 356         if (!sock_flag(sk, SOCK_DEAD)) {
 357                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 358                 return;
 359         }
 360
 361         if (u->addr)
 362                 unix_release_addr(u->addr);
 363
 364         atomic_long_dec(&unix_nr_socks);
 365         local_bh_disable();
 366         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 367         local_bh_enable();
 368 #ifdef UNIX_REFCNT_DEBUG
 369         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 370                 atomic_long_read(&unix_nr_socks));
 371 #endif
 372 }
 373
 374 static void unix_release_sock(struct sock *sk, int embrion)
 375 {
 376         struct unix_sock *u = unix_sk(sk);
 377         struct dentry *dentry;
 378         struct vfsmount *mnt;
 379         struct sock *skpair;
 380         struct sk_buff *skb;
 381         int state;
 382
 383         unix_remove_socket(sk);
 384
 385         /* Clear state */
 386         unix_state_lock(sk);
 387         sock_orphan(sk);
 388         sk->sk_shutdown = SHUTDOWN_MASK;
 389         dentry       = u->dentry;
 390         u->dentry    = NULL;
 391         mnt          = u->mnt;
 392         u->mnt       = NULL;
 393         state = sk->sk_state;
 394         sk->sk_state = TCP_CLOSE;
 395         unix_state_unlock(sk);
 396
 397         wake_up_interruptible_all(&u->peer_wait);
 398
 399         skpair = unix_peer(sk);
 400
 401         if (skpair != NULL) {
 402                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 403                         unix_state_lock(skpair);
 404                         /* No more writes */
 405                         skpair->sk_shutdown = SHUTDOWN_MASK;
 406                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 407                                 skpair->sk_err = ECONNRESET;
 408                         unix_state_unlock(skpair);
 409                         skpair->sk_state_change(skpair);
 410                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 411                 }
 412                 sock_put(skpair); /* It may now die */
 413                 unix_peer(sk) = NULL;
 414         }
 415
 416         /* Try to flush out this socket. Throw out buffers at least */
 417
 418         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 419                 if (state == TCP_LISTEN)
 420                         unix_release_sock(skb->sk, 1);
 421                 /* passed fds are erased in the kfree_skb hook        */
 422                 kfree_skb(skb);
 423         }
 424
 425         if (dentry) {
 426                 dput(dentry);
 427                 mntput(mnt);
 428         }
 429
 430         sock_put(sk);
 431
 432         /* ---- Socket is dead now and most probably destroyed ---- */
 433
 434         /*
 435          * Fixme: BSD difference: In BSD all sockets connected to use get
 436          *        ECONNRESET and we die on the spot. In Linux we behave
 437          *        like files and pipes do and wait for the last
 438          *        dereference.
 439          *
 440          * Can't we simply set sock->err?
 441          *
 442          *        What the above comment does talk about? --ANK(980817)
 443          */
 444
 445         if (unix_tot_inflight)
 446                 unix_gc();              /* Garbage collect fds */
 447 }
 448
 449 static void init_peercred(struct sock *sk)
 450 {
 451         put_pid(sk->sk_peer_pid);
 452         if (sk->sk_peer_cred)
 453                 put_cred(sk->sk_peer_cred);
 454         sk->sk_peer_pid  = get_pid(task_tgid(current));
 455         sk->sk_peer_cred = get_current_cred();
 456 }
 457
 458 static void copy_peercred(struct sock *sk, struct sock *peersk)
 459 {
 460         put_pid(sk->sk_peer_pid);
 461         if (sk->sk_peer_cred)
 462                 put_cred(sk->sk_peer_cred);
 463         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 464         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 465 }
 466
 467 static int unix_listen(struct socket *sock, int backlog)
 468 {
 469         int err;
 470         struct sock *sk = sock->sk;
 471         struct unix_sock *u = unix_sk(sk);
 472         struct pid *old_pid = NULL;
 473         const struct cred *old_cred = NULL;
 474
 475         err = -EOPNOTSUPP;
 476         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 477                 goto out;       /* Only stream/seqpacket sockets accept */
 478         err = -EINVAL;
 479         if (!u->addr)
 480                 goto out;       /* No listens on an unbound socket */
 481         unix_state_lock(sk);
 482         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 483                 goto out_unlock;
 484         if (backlog > sk->sk_max_ack_backlog)
 485                 wake_up_interruptible_all(&u->peer_wait);
 486         sk->sk_max_ack_backlog  = backlog;
 487         sk->sk_state            = TCP_LISTEN;
 488         /* set credentials so connect can copy them */
 489         init_peercred(sk);
 490         err = 0;
 491
 492 out_unlock:
 493         unix_state_unlock(sk);
 494         put_pid(old_pid);
 495         if (old_cred)
 496                 put_cred(old_cred);
 497 out:
 498         return err;
 499 }
 500
 501 static int unix_release(struct socket *);
 502 static int unix_bind(struct socket *, struct sockaddr *, int);
 503 static int unix_stream_connect(struct socket *, struct sockaddr *,
 504                                int addr_len, int flags);
 505 static int unix_socketpair(struct socket *, struct socket *);
 506 static int unix_accept(struct socket *, struct socket *, int);
 507 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 508 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 509 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 510                                     poll_table *);
 511 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 512 static int unix_shutdown(struct socket *, int);
 513 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 514                                struct msghdr *, size_t);
 515 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 516                                struct msghdr *, size_t, int);
 517 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 518                               struct msghdr *, size_t);
 519 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 520                               struct msghdr *, size_t, int);
 521 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 522                               int, int);
 523 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 524                                   struct msghdr *, size_t);
 525 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 526                                   struct msghdr *, size_t, int);
 527
 528 static const struct proto_ops unix_stream_ops = {
 529         .family =       PF_UNIX,
 530         .owner =        THIS_MODULE,
 531         .release =      unix_release,
 532         .bind =         unix_bind,
 533         .connect =      unix_stream_connect,
 534         .socketpair =   unix_socketpair,
 535         .accept =       unix_accept,
 536         .getname =      unix_getname,
 537         .poll =         unix_poll,
 538         .ioctl =        unix_ioctl,
 539         .listen =       unix_listen,
 540         .shutdown =     unix_shutdown,
 541         .setsockopt =   sock_no_setsockopt,
 542         .getsockopt =   sock_no_getsockopt,
 543         .sendmsg =      unix_stream_sendmsg,
 544         .recvmsg =      unix_stream_recvmsg,
 545         .mmap =         sock_no_mmap,
 546         .sendpage =     sock_no_sendpage,
 547 };
 548
 549 static const struct proto_ops unix_dgram_ops = {
 550         .family =       PF_UNIX,
 551         .owner =        THIS_MODULE,
 552         .release =      unix_release,
 553         .bind =         unix_bind,
 554         .connect =      unix_dgram_connect,
 555         .socketpair =   unix_socketpair,
 556         .accept =       sock_no_accept,
 557         .getname =      unix_getname,
 558         .poll =         unix_dgram_poll,
 559         .ioctl =        unix_ioctl,
 560         .listen =       sock_no_listen,
 561         .shutdown =     unix_shutdown,
 562         .setsockopt =   sock_no_setsockopt,
 563         .getsockopt =   sock_no_getsockopt,
 564         .sendmsg =      unix_dgram_sendmsg,
 565         .recvmsg =      unix_dgram_recvmsg,
 566         .mmap =         sock_no_mmap,
 567         .sendpage =     sock_no_sendpage,
 568 };
 569
 570 static const struct proto_ops unix_seqpacket_ops = {
 571         .family =       PF_UNIX,
 572         .owner =        THIS_MODULE,
 573         .release =      unix_release,
 574         .bind =         unix_bind,
 575         .connect =      unix_stream_connect,
 576         .socketpair =   unix_socketpair,
 577         .accept =       unix_accept,
 578         .getname =      unix_getname,
 579         .poll =         unix_dgram_poll,
 580         .ioctl =        unix_ioctl,
 581         .listen =       unix_listen,
 582         .shutdown =     unix_shutdown,
 583         .setsockopt =   sock_no_setsockopt,
 584         .getsockopt =   sock_no_getsockopt,
 585         .sendmsg =      unix_seqpacket_sendmsg,
 586         .recvmsg =      unix_seqpacket_recvmsg,
 587         .mmap =         sock_no_mmap,
 588         .sendpage =     sock_no_sendpage,
 589 };
 590
 591 static struct proto unix_proto = {
 592         .name                   = "UNIX",
 593         .owner                  = THIS_MODULE,
 594         .obj_size               = sizeof(struct unix_sock),
 595 };
 596
 597 /*
 598  * AF_UNIX sockets do not interact with hardware, hence they
 599  * dont trigger interrupts - so it's safe for them to have
 600  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 601  * this special lock-class by reinitializing the spinlock key:
 602  */
 603 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 604
 605 static struct sock *unix_create1(struct net *net, struct socket *sock)
 606 {
 607         struct sock *sk = NULL;
 608         struct unix_sock *u;
 609
 610         atomic_long_inc(&unix_nr_socks);
 611         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 612                 goto out;
 613
 614         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 615         if (!sk)
 616                 goto out;
 617
 618         sock_init_data(sock, sk);
 619         lockdep_set_class(&sk->sk_receive_queue.lock,
 620                                 &af_unix_sk_receive_queue_lock_key);
 621
 622         sk->sk_write_space      = unix_write_space;
 623         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 624         sk->sk_destruct         = unix_sock_destructor;
 625         u         = unix_sk(sk);
 626         u->dentry = NULL;
 627         u->mnt    = NULL;
 628         spin_lock_init(&u->lock);
 629         atomic_long_set(&u->inflight, 0);
 630         INIT_LIST_HEAD(&u->link);
 631         mutex_init(&u->readlock); /* single task reading lock */
 632         init_waitqueue_head(&u->peer_wait);
 633         unix_insert_socket(unix_sockets_unbound, sk);
 634 out:
 635         if (sk == NULL)
 636                 atomic_long_dec(&unix_nr_socks);
 637         else {
 638                 local_bh_disable();
 639                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 640                 local_bh_enable();
 641         }
 642         return sk;
 643 }
 644
 645 static int unix_create(struct net *net, struct socket *sock, int protocol,
 646                        int kern)
 647 {
 648         if (protocol && protocol != PF_UNIX)
 649                 return -EPROTONOSUPPORT;
 650
 651         sock->state = SS_UNCONNECTED;
 652
 653         switch (sock->type) {
 654         case SOCK_STREAM:
 655                 sock->ops = &unix_stream_ops;
 656                 break;
 657                 /*
 658                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 659                  *      nothing uses it.
 660                  */
 661         case SOCK_RAW:
 662                 sock->type = SOCK_DGRAM;
 663         case SOCK_DGRAM:
 664                 sock->ops = &unix_dgram_ops;
 665                 break;
 666         case SOCK_SEQPACKET:
 667                 sock->ops = &unix_seqpacket_ops;
 668                 break;
 669         default:
 670                 return -ESOCKTNOSUPPORT;
 671         }
 672
 673         return unix_create1(net, sock) ? 0 : -ENOMEM;
 674 }
 675
 676 static int unix_release(struct socket *sock)
 677 {
 678         struct sock *sk = sock->sk;
 679
 680         if (!sk)
 681                 return 0;
 682
 683         unix_release_sock(sk, 0);
 684         sock->sk = NULL;
 685
 686         return 0;
 687 }
 688
 689 static int unix_autobind(struct socket *sock)
 690 {
 691         struct sock *sk = sock->sk;
 692         struct net *net = sock_net(sk);
 693         struct unix_sock *u = unix_sk(sk);
 694         static u32 ordernum = 1;
 695         struct unix_address *addr;
 696         int err;
 697         unsigned int retries = 0;
 698
 699         mutex_lock(&u->readlock);
 700
 701         err = 0;
 702         if (u->addr)
 703                 goto out;
 704
 705         err = -ENOMEM;
 706         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 707         if (!addr)
 708                 goto out;
 709
 710         addr->name->sun_family = AF_UNIX;
 711         atomic_set(&addr->refcnt, 1);
 712
 713 retry:
 714         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 715         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 716
 717         spin_lock(&unix_table_lock);
 718         ordernum = (ordernum+1)&0xFFFFF;
 719
 720         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 721                                       addr->hash)) {
 722                 spin_unlock(&unix_table_lock);
 723                 /*
 724                  * __unix_find_socket_byname() may take long time if many names
 725                  * are already in use.
 726                  */
 727                 cond_resched();
 728                 /* Give up if all names seems to be in use. */
 729                 if (retries++ == 0xFFFFF) {
 730                         err = -ENOSPC;
 731                         kfree(addr);
 732                         goto out;
 733                 }
 734                 goto retry;
 735         }
 736         addr->hash ^= sk->sk_type;
 737
 738         __unix_remove_socket(sk);
 739         u->addr = addr;
 740         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 741         spin_unlock(&unix_table_lock);
 742         err = 0;
 743
 744 out:    mutex_unlock(&u->readlock);
 745         return err;
 746 }
 747
 748 static struct sock *unix_find_other(struct net *net,
 749                                     struct sockaddr_un *sunname, int len,
 750                                     int type, unsigned hash, int *error)
 751 {
 752         struct sock *u;
 753         struct path path;
 754         int err = 0;
 755
 756         if (sunname->sun_path[0]) {
 757                 struct inode *inode;
 758                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 759                 if (err)
 760                         goto fail;
 761                 inode = path.dentry->d_inode;
 762                 err = inode_permission(inode, MAY_WRITE);
 763                 if (err)
 764                         goto put_fail;
 765
 766                 err = -ECONNREFUSED;
 767                 if (!S_ISSOCK(inode->i_mode))
 768                         goto put_fail;
 769                 u = unix_find_socket_byinode(inode);
 770                 if (!u)
 771                         goto put_fail;
 772
 773                 if (u->sk_type == type)
 774                         touch_atime(path.mnt, path.dentry);
 775
 776                 path_put(&path);
 777
 778                 err = -EPROTOTYPE;
 779                 if (u->sk_type != type) {
 780                         sock_put(u);
 781                         goto fail;
 782                 }
 783         } else {
 784                 err = -ECONNREFUSED;
 785                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 786                 if (u) {
 787                         struct dentry *dentry;
 788                         dentry = unix_sk(u)->dentry;
 789                         if (dentry)
 790                                 touch_atime(unix_sk(u)->mnt, dentry);
 791                 } else
 792                         goto fail;
 793         }
 794         return u;
 795
 796 put_fail:
 797         path_put(&path);
 798 fail:
 799         *error = err;
 800         return NULL;
 801 }
 802
 803
 804 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 805 {
 806         struct sock *sk = sock->sk;
 807         struct net *net = sock_net(sk);
 808         struct unix_sock *u = unix_sk(sk);
 809         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 810         char *sun_path = sunaddr->sun_path;
 811         struct dentry *dentry = NULL;
 812         struct path path;
 813         int err;
 814         unsigned hash;
 815         struct unix_address *addr;
 816         struct hlist_head *list;
 817
 818         err = -EINVAL;
 819         if (sunaddr->sun_family != AF_UNIX)
 820                 goto out;
 821
 822         if (addr_len == sizeof(short)) {
 823                 err = unix_autobind(sock);
 824                 goto out;
 825         }
 826
 827         err = unix_mkname(sunaddr, addr_len, &hash);
 828         if (err < 0)
 829                 goto out;
 830         addr_len = err;
 831
 832         mutex_lock(&u->readlock);
 833
 834         err = -EINVAL;
 835         if (u->addr)
 836                 goto out_up;
 837
 838         err = -ENOMEM;
 839         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 840         if (!addr)
 841                 goto out_up;
 842
 843         memcpy(addr->name, sunaddr, addr_len);
 844         addr->len = addr_len;
 845         addr->hash = hash ^ sk->sk_type;
 846         atomic_set(&addr->refcnt, 1);
 847
 848         if (sun_path[0]) {
 849                 unsigned int mode;
 850                 err = 0;
 851                 /*
 852                  * Get the parent directory, calculate the hash for last
 853                  * component.
 854                  */
 855                 dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 856                 err = PTR_ERR(dentry);
 857                 if (IS_ERR(dentry))
 858                         goto out_mknod_parent;
 859
 860                 /*
 861                  * All right, let's create it.
 862                  */
 863                 mode = S_IFSOCK |
 864                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 865                 err = mnt_want_write(path.mnt);
 866                 if (err)
 867                         goto out_mknod_dput;
 868                 err = security_path_mknod(&path, dentry, mode, 0);
 869                 if (err)
 870                         goto out_mknod_drop_write;
 871                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 872 out_mknod_drop_write:
 873                 mnt_drop_write(path.mnt);
 874                 if (err)
 875                         goto out_mknod_dput;
 876                 mutex_unlock(&path.dentry->d_inode->i_mutex);
 877                 dput(path.dentry);
 878                 path.dentry = dentry;
 879
 880                 addr->hash = UNIX_HASH_SIZE;
 881         }
 882
 883         spin_lock(&unix_table_lock);
 884
 885         if (!sun_path[0]) {
 886                 err = -EADDRINUSE;
 887                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 888                                               sk->sk_type, hash)) {
 889                         unix_release_addr(addr);
 890                         goto out_unlock;
 891                 }
 892
 893                 list = &unix_socket_table[addr->hash];
 894         } else {
 895                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
 896                 u->dentry = path.dentry;
 897                 u->mnt    = path.mnt;
 898         }
 899
 900         err = 0;
 901         __unix_remove_socket(sk);
 902         u->addr = addr;
 903         __unix_insert_socket(list, sk);
 904
 905 out_unlock:
 906         spin_unlock(&unix_table_lock);
 907 out_up:
 908         mutex_unlock(&u->readlock);
 909 out:
 910         return err;
 911
 912 out_mknod_dput:
 913         dput(dentry);
 914         mutex_unlock(&path.dentry->d_inode->i_mutex);
 915         path_put(&path);
 916 out_mknod_parent:
 917         if (err == -EEXIST)
 918                 err = -EADDRINUSE;
 919         unix_release_addr(addr);
 920         goto out_up;
 921 }
 922
 923 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 924 {
 925         if (unlikely(sk1 == sk2) || !sk2) {
 926                 unix_state_lock(sk1);
 927                 return;
 928         }
 929         if (sk1 < sk2) {
 930                 unix_state_lock(sk1);
 931                 unix_state_lock_nested(sk2);
 932         } else {
 933                 unix_state_lock(sk2);
 934                 unix_state_lock_nested(sk1);
 935         }
 936 }
 937
 938 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 939 {
 940         if (unlikely(sk1 == sk2) || !sk2) {
 941                 unix_state_unlock(sk1);
 942                 return;
 943         }
 944         unix_state_unlock(sk1);
 945         unix_state_unlock(sk2);
 946 }
 947
 948 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 949                               int alen, int flags)
 950 {
 951         struct sock *sk = sock->sk;
 952         struct net *net = sock_net(sk);
 953         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 954         struct sock *other;
 955         unsigned hash;
 956         int err;
 957
 958         if (addr->sa_family != AF_UNSPEC) {
 959                 err = unix_mkname(sunaddr, alen, &hash);
 960                 if (err < 0)
 961                         goto out;
 962                 alen = err;
 963
 964                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 965                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 966                         goto out;
 967
 968 restart:
 969                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 970                 if (!other)
 971                         goto out;
 972
 973                 unix_state_double_lock(sk, other);
 974
 975                 /* Apparently VFS overslept socket death. Retry. */
 976                 if (sock_flag(other, SOCK_DEAD)) {
 977                         unix_state_double_unlock(sk, other);
 978                         sock_put(other);
 979                         goto restart;
 980                 }
 981
 982                 err = -EPERM;
 983                 if (!unix_may_send(sk, other))
 984                         goto out_unlock;
 985
 986                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 987                 if (err)
 988                         goto out_unlock;
 989
 990         } else {
 991                 /*
 992                  *      1003.1g breaking connected state with AF_UNSPEC
 993                  */
 994                 other = NULL;
 995                 unix_state_double_lock(sk, other);
 996         }
 997
 998         /*
 999          * If it was connected, reconnect.
1000          */
1001         if (unix_peer(sk)) {
1002                 struct sock *old_peer = unix_peer(sk);
1003                 unix_peer(sk) = other;
1004                 unix_state_double_unlock(sk, other);
1005
1006                 if (other != old_peer)
1007                         unix_dgram_disconnected(sk, old_peer);
1008                 sock_put(old_peer);
1009         } else {
1010                 unix_peer(sk) = other;
1011                 unix_state_double_unlock(sk, other);
1012         }
1013         return 0;
1014
1015 out_unlock:
1016         unix_state_double_unlock(sk, other);
1017         sock_put(other);
1018 out:
1019         return err;
1020 }
1021
1022 static long unix_wait_for_peer(struct sock *other, long timeo)
1023 {
1024         struct unix_sock *u = unix_sk(other);
1025         int sched;
1026         DEFINE_WAIT(wait);
1027
1028         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1029
1030         sched = !sock_flag(other, SOCK_DEAD) &&
1031                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1032                 unix_recvq_full(other);
1033
1034         unix_state_unlock(other);
1035
1036         if (sched)
1037                 timeo = schedule_timeout(timeo);
1038
1039         finish_wait(&u->peer_wait, &wait);
1040         return timeo;
1041 }
1042
1043 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1044                                int addr_len, int flags)
1045 {
1046         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1047         struct sock *sk = sock->sk;
1048         struct net *net = sock_net(sk);
1049         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1050         struct sock *newsk = NULL;
1051         struct sock *other = NULL;
1052         struct sk_buff *skb = NULL;
1053         unsigned hash;
1054         int st;
1055         int err;
1056         long timeo;
1057
1058         err = unix_mkname(sunaddr, addr_len, &hash);
1059         if (err < 0)
1060                 goto out;
1061         addr_len = err;
1062
1063         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1064             (err = unix_autobind(sock)) != 0)
1065                 goto out;
1066
1067         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1068
1069         /* First of all allocate resources.
1070            If we will make it after state is locked,
1071            we will have to recheck all again in any case.
1072          */
1073
1074         err = -ENOMEM;
1075
1076         /* create new sock for complete connection */
1077         newsk = unix_create1(sock_net(sk), NULL);
1078         if (newsk == NULL)
1079                 goto out;
1080
1081         /* Allocate skb for sending to listening sock */
1082         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1083         if (skb == NULL)
1084                 goto out;
1085
1086 restart:
1087         /*  Find listening sock. */
1088         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1089         if (!other)
1090                 goto out;
1091
1092         /* Latch state of peer */
1093         unix_state_lock(other);
1094
1095         /* Apparently VFS overslept socket death. Retry. */
1096         if (sock_flag(other, SOCK_DEAD)) {
1097                 unix_state_unlock(other);
1098                 sock_put(other);
1099                 goto restart;
1100         }
1101
1102         err = -ECONNREFUSED;
1103         if (other->sk_state != TCP_LISTEN)
1104                 goto out_unlock;
1105         if (other->sk_shutdown & RCV_SHUTDOWN)
1106                 goto out_unlock;
1107
1108         if (unix_recvq_full(other)) {
1109                 err = -EAGAIN;
1110                 if (!timeo)
1111                         goto out_unlock;
1112
1113                 timeo = unix_wait_for_peer(other, timeo);
1114
1115                 err = sock_intr_errno(timeo);
1116                 if (signal_pending(current))
1117                         goto out;
1118                 sock_put(other);
1119                 goto restart;
1120         }
1121
1122         /* Latch our state.
1123
1124            It is tricky place. We need to grab our state lock and cannot
1125            drop lock on peer. It is dangerous because deadlock is
1126            possible. Connect to self case and simultaneous
1127            attempt to connect are eliminated by checking socket
1128            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1129            check this before attempt to grab lock.
1130
1131            Well, and we have to recheck the state after socket locked.
1132          */
1133         st = sk->sk_state;
1134
1135         switch (st) {
1136         case TCP_CLOSE:
1137                 /* This is ok... continue with connect */
1138                 break;
1139         case TCP_ESTABLISHED:
1140                 /* Socket is already connected */
1141                 err = -EISCONN;
1142                 goto out_unlock;
1143         default:
1144                 err = -EINVAL;
1145                 goto out_unlock;
1146         }
1147
1148         unix_state_lock_nested(sk);
1149
1150         if (sk->sk_state != st) {
1151                 unix_state_unlock(sk);
1152                 unix_state_unlock(other);
1153                 sock_put(other);
1154                 goto restart;
1155         }
1156
1157         err = security_unix_stream_connect(sk, other, newsk);
1158         if (err) {
1159                 unix_state_unlock(sk);
1160                 goto out_unlock;
1161         }
1162
1163         /* The way is open! Fastly set all the necessary fields... */
1164
1165         sock_hold(sk);
1166         unix_peer(newsk)        = sk;
1167         newsk->sk_state         = TCP_ESTABLISHED;
1168         newsk->sk_type          = sk->sk_type;
1169         init_peercred(newsk);
1170         newu = unix_sk(newsk);
1171         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1172         otheru = unix_sk(other);
1173
1174         /* copy address information from listening to new sock*/
1175         if (otheru->addr) {
1176                 atomic_inc(&otheru->addr->refcnt);
1177                 newu->addr = otheru->addr;
1178         }
1179         if (otheru->dentry) {
1180                 newu->dentry    = dget(otheru->dentry);
1181                 newu->mnt       = mntget(otheru->mnt);
1182         }
1183
1184         /* Set credentials */
1185         copy_peercred(sk, other);
1186
1187         sock->state     = SS_CONNECTED;
1188         sk->sk_state    = TCP_ESTABLISHED;
1189         sock_hold(newsk);
1190
1191         smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
1192         unix_peer(sk)   = newsk;
1193
1194         unix_state_unlock(sk);
1195
1196         /* take ten and and send info to listening sock */
1197         spin_lock(&other->sk_receive_queue.lock);
1198         __skb_queue_tail(&other->sk_receive_queue, skb);
1199         spin_unlock(&other->sk_receive_queue.lock);
1200         unix_state_unlock(other);
1201         other->sk_data_ready(other, 0);
1202         sock_put(other);
1203         return 0;
1204
1205 out_unlock:
1206         if (other)
1207                 unix_state_unlock(other);
1208
1209 out:
1210         kfree_skb(skb);
1211         if (newsk)
1212                 unix_release_sock(newsk, 0);
1213         if (other)
1214                 sock_put(other);
1215         return err;
1216 }
1217
1218 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1219 {
1220         struct sock *ska = socka->sk, *skb = sockb->sk;
1221
1222         /* Join our sockets back to back */
1223         sock_hold(ska);
1224         sock_hold(skb);
1225         unix_peer(ska) = skb;
1226         unix_peer(skb) = ska;
1227         init_peercred(ska);
1228         init_peercred(skb);
1229
1230         if (ska->sk_type != SOCK_DGRAM) {
1231                 ska->sk_state = TCP_ESTABLISHED;
1232                 skb->sk_state = TCP_ESTABLISHED;
1233                 socka->state  = SS_CONNECTED;
1234                 sockb->state  = SS_CONNECTED;
1235         }
1236         return 0;
1237 }
1238
1239 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1240 {
1241         struct sock *sk = sock->sk;
1242         struct sock *tsk;
1243         struct sk_buff *skb;
1244         int err;
1245
1246         err = -EOPNOTSUPP;
1247         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1248                 goto out;
1249
1250         err = -EINVAL;
1251         if (sk->sk_state != TCP_LISTEN)
1252                 goto out;
1253
1254         /* If socket state is TCP_LISTEN it cannot change (for now...),
1255          * so that no locks are necessary.
1256          */
1257
1258         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1259         if (!skb) {
1260                 /* This means receive shutdown. */
1261                 if (err == 0)
1262                         err = -EINVAL;
1263                 goto out;
1264         }
1265
1266         tsk = skb->sk;
1267         skb_free_datagram(sk, skb);
1268         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1269
1270         /* attach accepted sock to socket */
1271         unix_state_lock(tsk);
1272         newsock->state = SS_CONNECTED;
1273         sock_graft(tsk, newsock);
1274         unix_state_unlock(tsk);
1275         return 0;
1276
1277 out:
1278         return err;
1279 }
1280
1281
1282 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1283 {
1284         struct sock *sk = sock->sk;
1285         struct unix_sock *u;
1286         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1287         int err = 0;
1288
1289         if (peer) {
1290                 sk = unix_peer_get(sk);
1291
1292                 err = -ENOTCONN;
1293                 if (!sk)
1294                         goto out;
1295                 err = 0;
1296         } else {
1297                 sock_hold(sk);
1298         }
1299
1300         u = unix_sk(sk);
1301         unix_state_lock(sk);
1302         if (!u->addr) {
1303                 sunaddr->sun_family = AF_UNIX;
1304                 sunaddr->sun_path[0] = 0;
1305                 *uaddr_len = sizeof(short);
1306         } else {
1307                 struct unix_address *addr = u->addr;
1308
1309                 *uaddr_len = addr->len;
1310                 memcpy(sunaddr, addr->name, *uaddr_len);
1311         }
1312         unix_state_unlock(sk);
1313         sock_put(sk);
1314 out:
1315         return err;
1316 }
1317
1318 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1319 {
1320         int i;
1321
1322         scm->fp = UNIXCB(skb).fp;
1323         UNIXCB(skb).fp = NULL;
1324
1325         for (i = scm->fp->count-1; i >= 0; i--)
1326                 unix_notinflight(scm->fp->fp[i]);
1327 }
1328
1329 static void unix_destruct_scm(struct sk_buff *skb)
1330 {
1331         struct scm_cookie scm;
1332         memset(&scm, 0, sizeof(scm));
1333         scm.pid  = UNIXCB(skb).pid;
1334         scm.cred = UNIXCB(skb).cred;
1335         if (UNIXCB(skb).fp)
1336                 unix_detach_fds(&scm, skb);
1337
1338         /* Alas, it calls VFS */
1339         /* So fscking what? fput() had been SMP-safe since the last Summer */
1340         scm_destroy(&scm);
1341         sock_wfree(skb);
1342 }
1343
1344 #define MAX_RECURSION_LEVEL 4
1345
1346 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1347 {
1348         int i;
1349         unsigned char max_level = 0;
1350         int unix_sock_count = 0;
1351
1352         for (i = scm->fp->count - 1; i >= 0; i--) {
1353                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1354
1355                 if (sk) {
1356                         unix_sock_count++;
1357                         max_level = max(max_level,
1358                                         unix_sk(sk)->recursion_level);
1359                 }
1360         }
1361         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1362                 return -ETOOMANYREFS;
1363
1364         /*
1365          * Need to duplicate file references for the sake of garbage
1366          * collection.  Otherwise a socket in the fps might become a
1367          * candidate for GC while the skb is not yet queued.
1368          */
1369         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1370         if (!UNIXCB(skb).fp)
1371                 return -ENOMEM;
1372
1373         if (unix_sock_count) {
1374                 for (i = scm->fp->count - 1; i >= 0; i--)
1375                         unix_inflight(scm->fp->fp[i]);
1376         }
1377         return max_level;
1378 }
1379
1380 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1381 {
1382         int err = 0;
1383
1384         UNIXCB(skb).pid  = get_pid(scm->pid);
1385         if (scm->cred)
1386                 UNIXCB(skb).cred = get_cred(scm->cred);
1387         UNIXCB(skb).fp = NULL;
1388         if (scm->fp && send_fds)
1389                 err = unix_attach_fds(scm, skb);
1390
1391         skb->destructor = unix_destruct_scm;
1392         return err;
1393 }
1394
1395 /*
1396  * Some apps rely on write() giving SCM_CREDENTIALS
1397  * We include credentials if source or destination socket
1398  * asserted SOCK_PASSCRED.
1399  */
1400 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1401                             const struct sock *other)
1402 {
1403         if (UNIXCB(skb).cred)
1404                 return;
1405         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1406             !other->sk_socket ||
1407             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1408                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1409                 UNIXCB(skb).cred = get_current_cred();
1410         }
1411 }
1412
1413 /*
1414  *      Send AF_UNIX data.
1415  */
1416
1417 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1418                               struct msghdr *msg, size_t len)
1419 {
1420         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1421         struct sock *sk = sock->sk;
1422         struct net *net = sock_net(sk);
1423         struct unix_sock *u = unix_sk(sk);
1424         struct sockaddr_un *sunaddr = msg->msg_name;
1425         struct sock *other = NULL;
1426         int namelen = 0; /* fake GCC */
1427         int err;
1428         unsigned hash;
1429         struct sk_buff *skb;
1430         long timeo;
1431         struct scm_cookie tmp_scm;
1432         int max_level;
1433
1434         if (NULL == siocb->scm)
1435                 siocb->scm = &tmp_scm;
1436         wait_for_unix_gc();
1437         err = scm_send(sock, msg, siocb->scm, false);
1438         if (err < 0)
1439                 return err;
1440
1441         err = -EOPNOTSUPP;
1442         if (msg->msg_flags&MSG_OOB)
1443                 goto out;
1444
1445         if (msg->msg_namelen) {
1446                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1447                 if (err < 0)
1448                         goto out;
1449                 namelen = err;
1450         } else {
1451                 sunaddr = NULL;
1452                 err = -ENOTCONN;
1453                 other = unix_peer_get(sk);
1454                 if (!other)
1455                         goto out;
1456         }
1457
1458         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1459             && (err = unix_autobind(sock)) != 0)
1460                 goto out;
1461
1462         err = -EMSGSIZE;
1463         if (len > sk->sk_sndbuf - 32)
1464                 goto out;
1465
1466         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1467         if (skb == NULL)
1468                 goto out;
1469
1470         err = unix_scm_to_skb(siocb->scm, skb, true);
1471         if (err < 0)
1472                 goto out_free;
1473         max_level = err + 1;
1474         unix_get_secdata(siocb->scm, skb);
1475
1476         skb_reset_transport_header(skb);
1477         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1478         if (err)
1479                 goto out_free;
1480
1481         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1482
1483 restart:
1484         if (!other) {
1485                 err = -ECONNRESET;
1486                 if (sunaddr == NULL)
1487                         goto out_free;
1488
1489                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1490                                         hash, &err);
1491                 if (other == NULL)
1492                         goto out_free;
1493         }
1494
1495         if (sk_filter(other, skb) < 0) {
1496                 /* Toss the packet but do not return any error to the sender */
1497                 err = len;
1498                 goto out_free;
1499         }
1500
1501         unix_state_lock(other);
1502         err = -EPERM;
1503         if (!unix_may_send(sk, other))
1504                 goto out_unlock;
1505
1506         if (sock_flag(other, SOCK_DEAD)) {
1507                 /*
1508                  *      Check with 1003.1g - what should
1509                  *      datagram error
1510                  */
1511                 unix_state_unlock(other);
1512                 sock_put(other);
1513
1514                 err = 0;
1515                 unix_state_lock(sk);
1516                 if (unix_peer(sk) == other) {
1517                         unix_peer(sk) = NULL;
1518                         unix_state_unlock(sk);
1519
1520                         unix_dgram_disconnected(sk, other);
1521                         sock_put(other);
1522                         err = -ECONNREFUSED;
1523                 } else {
1524                         unix_state_unlock(sk);
1525                 }
1526
1527                 other = NULL;
1528                 if (err)
1529                         goto out_free;
1530                 goto restart;
1531         }
1532
1533         err = -EPIPE;
1534         if (other->sk_shutdown & RCV_SHUTDOWN)
1535                 goto out_unlock;
1536
1537         if (sk->sk_type != SOCK_SEQPACKET) {
1538                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1539                 if (err)
1540                         goto out_unlock;
1541         }
1542
1543         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1544                 if (!timeo) {
1545                         err = -EAGAIN;
1546                         goto out_unlock;
1547                 }
1548
1549                 timeo = unix_wait_for_peer(other, timeo);
1550
1551                 err = sock_intr_errno(timeo);
1552                 if (signal_pending(current))
1553                         goto out_free;
1554
1555                 goto restart;
1556         }
1557
1558         if (sock_flag(other, SOCK_RCVTSTAMP))
1559                 __net_timestamp(skb);
1560         maybe_add_creds(skb, sock, other);
1561         skb_queue_tail(&other->sk_receive_queue, skb);
1562         if (max_level > unix_sk(other)->recursion_level)
1563                 unix_sk(other)->recursion_level = max_level;
1564         unix_state_unlock(other);
1565         other->sk_data_ready(other, len);
1566         sock_put(other);
1567         scm_destroy(siocb->scm);
1568         return len;
1569
1570 out_unlock:
1571         unix_state_unlock(other);
1572 out_free:
1573         kfree_skb(skb);
1574 out:
1575         if (other)
1576                 sock_put(other);
1577         scm_destroy(siocb->scm);
1578         return err;
1579 }
1580
1581
1582 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1583                                struct msghdr *msg, size_t len)
1584 {
1585         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1586         struct sock *sk = sock->sk;
1587         struct sock *other = NULL;
1588         int err, size;
1589         struct sk_buff *skb;
1590         int sent = 0;
1591         struct scm_cookie tmp_scm;
1592         bool fds_sent = false;
1593         int max_level;
1594
1595         if (NULL == siocb->scm)
1596                 siocb->scm = &tmp_scm;
1597         wait_for_unix_gc();
1598         err = scm_send(sock, msg, siocb->scm, false);
1599         if (err < 0)
1600                 return err;
1601
1602         err = -EOPNOTSUPP;
1603         if (msg->msg_flags&MSG_OOB)
1604                 goto out_err;
1605
1606         if (msg->msg_namelen) {
1607                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1608                 goto out_err;
1609         } else {
1610                 err = -ENOTCONN;
1611                 other = unix_peer(sk);
1612                 if (!other)
1613                         goto out_err;
1614         }
1615
1616         if (sk->sk_shutdown & SEND_SHUTDOWN)
1617                 goto pipe_err;
1618
1619         while (sent < len) {
1620                 /*
1621                  *      Optimisation for the fact that under 0.01% of X
1622                  *      messages typically need breaking up.
1623                  */
1624
1625                 size = len-sent;
1626
1627                 /* Keep two messages in the pipe so it schedules better */
1628                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1629                         size = (sk->sk_sndbuf >> 1) - 64;
1630
1631                 if (size > SKB_MAX_ALLOC)
1632                         size = SKB_MAX_ALLOC;
1633
1634                 /*
1635                  *      Grab a buffer
1636                  */
1637
1638                 skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1639                                           &err);
1640
1641                 if (skb == NULL)
1642                         goto out_err;
1643
1644                 /*
1645                  *      If you pass two values to the sock_alloc_send_skb
1646                  *      it tries to grab the large buffer with GFP_NOFS
1647                  *      (which can fail easily), and if it fails grab the
1648                  *      fallback size buffer which is under a page and will
1649                  *      succeed. [Alan]
1650                  */
1651                 size = min_t(int, size, skb_tailroom(skb));
1652
1653
1654                 /* Only send the fds in the first buffer */
1655                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1656                 if (err < 0) {
1657                         kfree_skb(skb);
1658                         goto out_err;
1659                 }
1660                 max_level = err + 1;
1661                 fds_sent = true;
1662
1663                 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1664                 if (err) {
1665                         kfree_skb(skb);
1666                         goto out_err;
1667                 }
1668
1669                 unix_state_lock(other);
1670
1671                 if (sock_flag(other, SOCK_DEAD) ||
1672                     (other->sk_shutdown & RCV_SHUTDOWN))
1673                         goto pipe_err_free;
1674
1675                 maybe_add_creds(skb, sock, other);
1676                 skb_queue_tail(&other->sk_receive_queue, skb);
1677                 if (max_level > unix_sk(other)->recursion_level)
1678                         unix_sk(other)->recursion_level = max_level;
1679                 unix_state_unlock(other);
1680                 other->sk_data_ready(other, size);
1681                 sent += size;
1682         }
1683
1684         scm_destroy(siocb->scm);
1685         siocb->scm = NULL;
1686
1687         return sent;
1688
1689 pipe_err_free:
1690         unix_state_unlock(other);
1691         kfree_skb(skb);
1692 pipe_err:
1693         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1694                 send_sig(SIGPIPE, current, 0);
1695         err = -EPIPE;
1696 out_err:
1697         scm_destroy(siocb->scm);
1698         siocb->scm = NULL;
1699         return sent ? : err;
1700 }
1701
1702 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1703                                   struct msghdr *msg, size_t len)
1704 {
1705         int err;
1706         struct sock *sk = sock->sk;
1707
1708         err = sock_error(sk);
1709         if (err)
1710                 return err;
1711
1712         if (sk->sk_state != TCP_ESTABLISHED)
1713                 return -ENOTCONN;
1714
1715         if (msg->msg_namelen)
1716                 msg->msg_namelen = 0;
1717
1718         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1719 }
1720
1721 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1722                               struct msghdr *msg, size_t size,
1723                               int flags)
1724 {
1725         struct sock *sk = sock->sk;
1726
1727         if (sk->sk_state != TCP_ESTABLISHED)
1728                 return -ENOTCONN;
1729
1730         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1731 }
1732
1733 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1734 {
1735         struct unix_sock *u = unix_sk(sk);
1736
1737         msg->msg_namelen = 0;
1738         if (u->addr) {
1739                 msg->msg_namelen = u->addr->len;
1740                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1741         }
1742 }
1743
1744 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1745                               struct msghdr *msg, size_t size,
1746                               int flags)
1747 {
1748         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1749         struct scm_cookie tmp_scm;
1750         struct sock *sk = sock->sk;
1751         struct unix_sock *u = unix_sk(sk);
1752         int noblock = flags & MSG_DONTWAIT;
1753         struct sk_buff *skb;
1754         int err;
1755
1756         err = -EOPNOTSUPP;
1757         if (flags&MSG_OOB)
1758                 goto out;
1759
1760         msg->msg_namelen = 0;
1761
1762         err = mutex_lock_interruptible(&u->readlock);
1763         if (err) {
1764                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1765                 goto out;
1766         }
1767
1768         skb = skb_recv_datagram(sk, flags, noblock, &err);
1769         if (!skb) {
1770                 unix_state_lock(sk);
1771                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1772                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1773                     (sk->sk_shutdown & RCV_SHUTDOWN))
1774                         err = 0;
1775                 unix_state_unlock(sk);
1776                 goto out_unlock;
1777         }
1778
1779         wake_up_interruptible_sync_poll(&u->peer_wait,
1780                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1781
1782         if (msg->msg_name)
1783                 unix_copy_addr(msg, skb->sk);
1784
1785         if (size > skb->len)
1786                 size = skb->len;
1787         else if (size < skb->len)
1788                 msg->msg_flags |= MSG_TRUNC;
1789
1790         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1791         if (err)
1792                 goto out_free;
1793
1794         if (sock_flag(sk, SOCK_RCVTSTAMP))
1795                 __sock_recv_timestamp(msg, sk, skb);
1796
1797         if (!siocb->scm) {
1798                 siocb->scm = &tmp_scm;
1799                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1800         }
1801         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1802         unix_set_secdata(siocb->scm, skb);
1803
1804         if (!(flags & MSG_PEEK)) {
1805                 if (UNIXCB(skb).fp)
1806                         unix_detach_fds(siocb->scm, skb);
1807         } else {
1808                 /* It is questionable: on PEEK we could:
1809                    - do not return fds - good, but too simple 8)
1810                    - return fds, and do not return them on read (old strategy,
1811                      apparently wrong)
1812                    - clone fds (I chose it for now, it is the most universal
1813                      solution)
1814
1815                    POSIX 1003.1g does not actually define this clearly
1816                    at all. POSIX 1003.1g doesn't define a lot of things
1817                    clearly however!
1818
1819                 */
1820                 if (UNIXCB(skb).fp)
1821                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1822         }
1823         err = size;
1824
1825         scm_recv(sock, msg, siocb->scm, flags);
1826
1827 out_free:
1828         skb_free_datagram(sk, skb);
1829 out_unlock:
1830         mutex_unlock(&u->readlock);
1831 out:
1832         return err;
1833 }
1834
1835 /*
1836  *      Sleep until data has arrive. But check for races..
1837  */
1838
1839 static long unix_stream_data_wait(struct sock *sk, long timeo)
1840 {
1841         DEFINE_WAIT(wait);
1842
1843         unix_state_lock(sk);
1844
1845         for (;;) {
1846                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1847
1848                 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1849                     sk->sk_err ||
1850                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1851                     signal_pending(current) ||
1852                     !timeo)
1853                         break;
1854
1855                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1856                 unix_state_unlock(sk);
1857                 timeo = schedule_timeout(timeo);
1858                 unix_state_lock(sk);
1859                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1860         }
1861
1862         finish_wait(sk_sleep(sk), &wait);
1863         unix_state_unlock(sk);
1864         return timeo;
1865 }
1866
1867
1868
1869 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1870                                struct msghdr *msg, size_t size,
1871                                int flags)
1872 {
1873         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1874         struct scm_cookie tmp_scm;
1875         struct sock *sk = sock->sk;
1876         struct unix_sock *u = unix_sk(sk);
1877         struct sockaddr_un *sunaddr = msg->msg_name;
1878         int copied = 0;
1879         int check_creds = 0;
1880         int target;
1881         int err = 0;
1882         long timeo;
1883
1884         err = -EINVAL;
1885         if (sk->sk_state != TCP_ESTABLISHED)
1886                 goto out;
1887
1888         err = -EOPNOTSUPP;
1889         if (flags&MSG_OOB)
1890                 goto out;
1891
1892         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1893         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1894
1895         msg->msg_namelen = 0;
1896
1897         /* Lock the socket to prevent queue disordering
1898          * while sleeps in memcpy_tomsg
1899          */
1900
1901         if (!siocb->scm) {
1902                 siocb->scm = &tmp_scm;
1903                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1904         }
1905
1906         err = mutex_lock_interruptible(&u->readlock);
1907         if (err) {
1908                 err = sock_intr_errno(timeo);
1909                 goto out;
1910         }
1911
1912         do {
1913                 int chunk;
1914                 struct sk_buff *skb;
1915
1916                 unix_state_lock(sk);
1917                 skb = skb_peek(&sk->sk_receive_queue);
1918                 if (skb == NULL) {
1919                         unix_sk(sk)->recursion_level = 0;
1920                         if (copied >= target)
1921                                 goto unlock;
1922
1923                         /*
1924                          *      POSIX 1003.1g mandates this order.
1925                          */
1926
1927                         err = sock_error(sk);
1928                         if (err)
1929                                 goto unlock;
1930                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1931                                 goto unlock;
1932
1933                         unix_state_unlock(sk);
1934                         err = -EAGAIN;
1935                         if (!timeo)
1936                                 break;
1937                         mutex_unlock(&u->readlock);
1938
1939                         timeo = unix_stream_data_wait(sk, timeo);
1940
1941                         if (signal_pending(current)
1942                             ||  mutex_lock_interruptible(&u->readlock)) {
1943                                 err = sock_intr_errno(timeo);
1944                                 goto out;
1945                         }
1946
1947                         continue;
1948  unlock:
1949                         unix_state_unlock(sk);
1950                         break;
1951                 }
1952                 unix_state_unlock(sk);
1953
1954                 if (check_creds) {
1955                         /* Never glue messages from different writers */
1956                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1957                             (UNIXCB(skb).cred != siocb->scm->cred))
1958                                 break;
1959                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1960                         /* Copy credentials */
1961                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1962                         check_creds = 1;
1963                 }
1964
1965                 /* Copy address just once */
1966                 if (sunaddr) {
1967                         unix_copy_addr(msg, skb->sk);
1968                         sunaddr = NULL;
1969                 }
1970
1971                 chunk = min_t(unsigned int, skb->len, size);
1972                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1973                         if (copied == 0)
1974                                 copied = -EFAULT;
1975                         break;
1976                 }
1977                 copied += chunk;
1978                 size -= chunk;
1979
1980                 /* Mark read part of skb as used */
1981                 if (!(flags & MSG_PEEK)) {
1982                         skb_pull(skb, chunk);
1983
1984                         if (UNIXCB(skb).fp)
1985                                 unix_detach_fds(siocb->scm, skb);
1986
1987                         if (skb->len)
1988                                 break;
1989
1990                         skb_unlink(skb, &sk->sk_receive_queue);
1991                         consume_skb(skb);
1992
1993                         if (siocb->scm->fp)
1994                                 break;
1995                 } else {
1996                         /* It is questionable, see note in unix_dgram_recvmsg.
1997                          */
1998                         if (UNIXCB(skb).fp)
1999                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2000
2001                         break;
2002                 }
2003         } while (size);
2004
2005         mutex_unlock(&u->readlock);
2006         scm_recv(sock, msg, siocb->scm, flags);
2007 out:
2008         return copied ? : err;
2009 }
2010
2011 static int unix_shutdown(struct socket *sock, int mode)
2012 {
2013         struct sock *sk = sock->sk;
2014         struct sock *other;
2015
2016         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
2017
2018         if (!mode)
2019                 return 0;
2020
2021         unix_state_lock(sk);
2022         sk->sk_shutdown |= mode;
2023         other = unix_peer(sk);
2024         if (other)
2025                 sock_hold(other);
2026         unix_state_unlock(sk);
2027         sk->sk_state_change(sk);
2028
2029         if (other &&
2030                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2031
2032                 int peer_mode = 0;
2033
2034                 if (mode&RCV_SHUTDOWN)
2035                         peer_mode |= SEND_SHUTDOWN;
2036                 if (mode&SEND_SHUTDOWN)
2037                         peer_mode |= RCV_SHUTDOWN;
2038                 unix_state_lock(other);
2039                 other->sk_shutdown |= peer_mode;
2040                 unix_state_unlock(other);
2041                 other->sk_state_change(other);
2042                 if (peer_mode == SHUTDOWN_MASK)
2043                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2044                 else if (peer_mode & RCV_SHUTDOWN)
2045                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2046         }
2047         if (other)
2048                 sock_put(other);
2049
2050         return 0;
2051 }
2052
2053 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2054 {
2055         struct sock *sk = sock->sk;
2056         long amount = 0;
2057         int err;
2058
2059         switch (cmd) {
2060         case SIOCOUTQ:
2061                 amount = sk_wmem_alloc_get(sk);
2062                 err = put_user(amount, (int __user *)arg);
2063                 break;
2064         case SIOCINQ:
2065                 {
2066                         struct sk_buff *skb;
2067
2068                         if (sk->sk_state == TCP_LISTEN) {
2069                                 err = -EINVAL;
2070                                 break;
2071                         }
2072
2073                         spin_lock(&sk->sk_receive_queue.lock);
2074                         if (sk->sk_type == SOCK_STREAM ||
2075                             sk->sk_type == SOCK_SEQPACKET) {
2076                                 skb_queue_walk(&sk->sk_receive_queue, skb)
2077                                         amount += skb->len;
2078                         } else {
2079                                 skb = skb_peek(&sk->sk_receive_queue);
2080                                 if (skb)
2081                                         amount = skb->len;
2082                         }
2083                         spin_unlock(&sk->sk_receive_queue.lock);
2084                         err = put_user(amount, (int __user *)arg);
2085                         break;
2086                 }
2087
2088         default:
2089                 err = -ENOIOCTLCMD;
2090                 break;
2091         }
2092         return err;
2093 }
2094
2095 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2096 {
2097         struct sock *sk = sock->sk;
2098         unsigned int mask;
2099
2100         sock_poll_wait(file, sk_sleep(sk), wait);
2101         mask = 0;
2102
2103         /* exceptional events? */
2104         if (sk->sk_err)
2105                 mask |= POLLERR;
2106         if (sk->sk_shutdown == SHUTDOWN_MASK)
2107                 mask |= POLLHUP;
2108         if (sk->sk_shutdown & RCV_SHUTDOWN)
2109                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2110
2111         /* readable? */
2112         if (!skb_queue_empty(&sk->sk_receive_queue))
2113                 mask |= POLLIN | POLLRDNORM;
2114
2115         /* Connection-based need to check for termination and startup */
2116         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2117             sk->sk_state == TCP_CLOSE)
2118                 mask |= POLLHUP;
2119
2120         /*
2121          * we set writable also when the other side has shut down the
2122          * connection. This prevents stuck sockets.
2123          */
2124         if (unix_writable(sk))
2125                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2126
2127         return mask;
2128 }
2129
2130 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2131                                     poll_table *wait)
2132 {
2133         struct sock *sk = sock->sk, *other;
2134         unsigned int mask, writable;
2135
2136         sock_poll_wait(file, sk_sleep(sk), wait);
2137         mask = 0;
2138
2139         /* exceptional events? */
2140         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2141                 mask |= POLLERR;
2142         if (sk->sk_shutdown & RCV_SHUTDOWN)
2143                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2144         if (sk->sk_shutdown == SHUTDOWN_MASK)
2145                 mask |= POLLHUP;
2146
2147         /* readable? */
2148         if (!skb_queue_empty(&sk->sk_receive_queue))
2149                 mask |= POLLIN | POLLRDNORM;
2150
2151         /* Connection-based need to check for termination and startup */
2152         if (sk->sk_type == SOCK_SEQPACKET) {
2153                 if (sk->sk_state == TCP_CLOSE)
2154                         mask |= POLLHUP;
2155                 /* connection hasn't started yet? */
2156                 if (sk->sk_state == TCP_SYN_SENT)
2157                         return mask;
2158         }
2159
2160         /* No write status requested, avoid expensive OUT tests. */
2161         if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
2162                 return mask;
2163
2164         writable = unix_writable(sk);
2165         other = unix_peer_get(sk);
2166         if (other) {
2167                 if (unix_peer(other) != sk) {
2168                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2169                         if (unix_recvq_full(other))
2170                                 writable = 0;
2171                 }
2172                 sock_put(other);
2173         }
2174
2175         if (writable)
2176                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2177         else
2178                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2179
2180         return mask;
2181 }
2182
2183 #ifdef CONFIG_PROC_FS
2184 static struct sock *first_unix_socket(int *i)
2185 {
2186         for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) {
2187                 if (!hlist_empty(&unix_socket_table[*i]))
2188                         return __sk_head(&unix_socket_table[*i]);
2189         }
2190         return NULL;
2191 }
2192
2193 static struct sock *next_unix_socket(int *i, struct sock *s)
2194 {
2195         struct sock *next = sk_next(s);
2196         /* More in this chain? */
2197         if (next)
2198                 return next;
2199         /* Look for next non-empty chain. */
2200         for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) {
2201                 if (!hlist_empty(&unix_socket_table[*i]))
2202                         return __sk_head(&unix_socket_table[*i]);
2203         }
2204         return NULL;
2205 }
2206
2207 struct unix_iter_state {
2208         struct seq_net_private p;
2209         int i;
2210 };
2211
2212 static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
2213 {
2214         struct unix_iter_state *iter = seq->private;
2215         loff_t off = 0;
2216         struct sock *s;
2217
2218         for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
2219                 if (sock_net(s) != seq_file_net(seq))
2220                         continue;
2221                 if (off == pos)
2222                         return s;
2223                 ++off;
2224         }
2225         return NULL;
2226 }
2227
2228 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2229         __acquires(unix_table_lock)
2230 {
2231         spin_lock(&unix_table_lock);
2232         return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2233 }
2234
2235 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2236 {
2237         struct unix_iter_state *iter = seq->private;
2238         struct sock *sk = v;
2239         ++*pos;
2240
2241         if (v == SEQ_START_TOKEN)
2242                 sk = first_unix_socket(&iter->i);
2243         else
2244                 sk = next_unix_socket(&iter->i, sk);
2245         while (sk && (sock_net(sk) != seq_file_net(seq)))
2246                 sk = next_unix_socket(&iter->i, sk);
2247         return sk;
2248 }
2249
2250 static void unix_seq_stop(struct seq_file *seq, void *v)
2251         __releases(unix_table_lock)
2252 {
2253         spin_unlock(&unix_table_lock);
2254 }
2255
2256 static int unix_seq_show(struct seq_file *seq, void *v)
2257 {
2258
2259         if (v == SEQ_START_TOKEN)
2260                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2261                          "Inode Path\n");
2262         else {
2263                 struct sock *s = v;
2264                 struct unix_sock *u = unix_sk(s);
2265                 unix_state_lock(s);
2266
2267                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2268                         s,
2269                         atomic_read(&s->sk_refcnt),
2270                         0,
2271                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2272                         s->sk_type,
2273                         s->sk_socket ?
2274                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2275                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2276                         sock_i_ino(s));
2277
2278                 if (u->addr) {
2279                         int i, len;
2280                         seq_putc(seq, ' ');
2281
2282                         i = 0;
2283                         len = u->addr->len - sizeof(short);
2284                         if (!UNIX_ABSTRACT(s))
2285                                 len--;
2286                         else {
2287                                 seq_putc(seq, '@');
2288                                 i++;
2289                         }
2290                         for ( ; i < len; i++)
2291                                 seq_putc(seq, u->addr->name->sun_path[i]);
2292                 }
2293                 unix_state_unlock(s);
2294                 seq_putc(seq, '\n');
2295         }
2296
2297         return 0;
2298 }
2299
2300 static const struct seq_operations unix_seq_ops = {
2301         .start  = unix_seq_start,
2302         .next   = unix_seq_next,
2303         .stop   = unix_seq_stop,
2304         .show   = unix_seq_show,
2305 };
2306
2307 static int unix_seq_open(struct inode *inode, struct file *file)
2308 {
2309         return seq_open_net(inode, file, &unix_seq_ops,
2310                             sizeof(struct unix_iter_state));
2311 }
2312
2313 static const struct file_operations unix_seq_fops = {
2314         .owner          = THIS_MODULE,
2315         .open           = unix_seq_open,
2316         .read           = seq_read,
2317         .llseek         = seq_lseek,
2318         .release        = seq_release_net,
2319 };
2320
2321 #endif
2322
2323 static const struct net_proto_family unix_family_ops = {
2324         .family = PF_UNIX,
2325         .create = unix_create,
2326         .owner  = THIS_MODULE,
2327 };
2328
2329
2330 static int __net_init unix_net_init(struct net *net)
2331 {
2332         int error = -ENOMEM;
2333
2334         net->unx.sysctl_max_dgram_qlen = 10;
2335         if (unix_sysctl_register(net))
2336                 goto out;
2337
2338 #ifdef CONFIG_PROC_FS
2339         if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) {
2340                 unix_sysctl_unregister(net);
2341                 goto out;
2342         }
2343 #endif
2344         error = 0;
2345 out:
2346         return error;
2347 }
2348
2349 static void __net_exit unix_net_exit(struct net *net)
2350 {
2351         unix_sysctl_unregister(net);
2352         proc_net_remove(net, "unix");
2353 }
2354
2355 static struct pernet_operations unix_net_ops = {
2356         .init = unix_net_init,
2357         .exit = unix_net_exit,
2358 };
2359
2360 static int __init af_unix_init(void)
2361 {
2362         int rc = -1;
2363         struct sk_buff *dummy_skb;
2364
2365         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
2366
2367         rc = proto_register(&unix_proto, 1);
2368         if (rc != 0) {
2369                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2370                        __func__);
2371                 goto out;
2372         }
2373
2374         sock_register(&unix_family_ops);
2375         register_pernet_subsys(&unix_net_ops);
2376 out:
2377         return rc;
2378 }
2379
2380 static void __exit af_unix_exit(void)
2381 {
2382         sock_unregister(PF_UNIX);
2383         proto_unregister(&unix_proto);
2384         unregister_pernet_subsys(&unix_net_ops);
2385 }
2386
2387 /* Earlier than device_initcall() so that other drivers invoking
2388    request_module() don't end up in a loop when modprobe tries
2389    to use a UNIX socket. But later than subsys_initcall() because
2390    we depend on stuff initialised there */
2391 fs_initcall(af_unix_init);
2392 module_exit(af_unix_exit);
2393
2394 MODULE_LICENSE("GPL");
2395 MODULE_ALIAS_NETPROTO(PF_UNIX);