Linux 3.2.102 [pandora-kernel.git]
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b595a3d..8d0b803 100644
@@ -149,8 +149,8 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 
 static inline unsigned unix_hash_fold(__wsum n)
 {
-       unsigned hash = (__force unsigned)n;
-       hash ^= hash>>16;
+       unsigned int hash = (__force unsigned int)csum_fold(n);
+
        hash ^= hash>>8;
        return hash&(UNIX_HASH_SIZE-1);
 }
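
For reference, the rewritten fold now runs the 32-bit partial checksum through csum_fold() (a 16-bit ones'-complement fold) before xor-folding down to a bucket index. A minimal userspace sketch of the resulting calculation, assuming UNIX_HASH_SIZE is 256 as in this kernel's include/net/af_unix.h and using a plain ones'-complement fold as a stand-in for csum_fold():

/* Userspace illustration only -- not kernel code.  UNIX_HASH_SIZE and the
 * ones'-complement fold are assumptions standing in for the kernel macros. */
#include <stdint.h>
#include <stdio.h>

#define UNIX_HASH_SIZE 256

static unsigned int fold16(uint32_t sum)	/* rough csum_fold() stand-in */
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries into low 16 bits */
	sum = (sum & 0xffff) + (sum >> 16);
	return (~sum) & 0xffff;
}

static unsigned int unix_hash_fold_demo(uint32_t wsum)
{
	unsigned int hash = fold16(wsum);	/* 32 -> 16 bits */

	hash ^= hash >> 8;			/* 16 -> 8 bits */
	return hash & (UNIX_HASH_SIZE - 1);	/* bucket index */
}

int main(void)
{
	printf("bucket: %u\n", unix_hash_fold_demo(0xdeadbeef));
	return 0;
}
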
@@ -303,6 +303,118 @@ found:
        return s;
 }
 
+/* Support code for asymmetrically connected dgram sockets
+ *
+ * If a datagram socket is connected to a socket not itself connected
+ * to the first socket (eg, /dev/log), clients may only enqueue more
+ * messages if the present receive queue of the server socket is not
+ * "too large". This means there's a second writeability condition
+ * poll and sendmsg need to test. The dgram recv code will do a wake
+ * up on the peer_wait wait queue of a socket upon reception of a
+ * datagram which needs to be propagated to sleeping would-be writers
+ * since these might not have sent anything so far. This can't be
+ * accomplished via poll_wait because the lifetime of the server
+ * socket might be less than that of its clients if these break their
+ * association with it or if the server socket is closed while clients
+ * are still connected to it and there's no way to inform "a polling
+ * implementation" that it should let go of a certain wait queue.
+ *
+ * In order to propagate a wake up, a wait_queue_t of the client
+ * socket is enqueued on the peer_wait queue of the server socket
+ * whose wake function does a wake_up on the ordinary client socket
+ * wait queue. This connection is established whenever a write (or
+ * poll for write) hits the flow control condition and is broken when
+ * the association to the server socket is dissolved or after a wake
+ * up was relayed.
+ */
+
+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+                                     void *key)
+{
+       struct unix_sock *u;
+       wait_queue_head_t *u_sleep;
+
+       u = container_of(q, struct unix_sock, peer_wake);
+
+       __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
+                           q);
+       u->peer_wake.private = NULL;
+
+       /* relaying can only happen while the wq still exists */
+       u_sleep = sk_sleep(&u->sk);
+       if (u_sleep)
+               wake_up_interruptible_poll(u_sleep, key);
+
+       return 0;
+}
+
+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
+{
+       struct unix_sock *u, *u_other;
+       int rc;
+
+       u = unix_sk(sk);
+       u_other = unix_sk(other);
+       rc = 0;
+       spin_lock(&u_other->peer_wait.lock);
+
+       if (!u->peer_wake.private) {
+               u->peer_wake.private = other;
+               __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
+
+               rc = 1;
+       }
+
+       spin_unlock(&u_other->peer_wait.lock);
+       return rc;
+}
+
+static void unix_dgram_peer_wake_disconnect(struct sock *sk,
+                                           struct sock *other)
+{
+       struct unix_sock *u, *u_other;
+
+       u = unix_sk(sk);
+       u_other = unix_sk(other);
+       spin_lock(&u_other->peer_wait.lock);
+
+       if (u->peer_wake.private == other) {
+               __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
+               u->peer_wake.private = NULL;
+       }
+
+       spin_unlock(&u_other->peer_wait.lock);
+}
+
+static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
+                                                  struct sock *other)
+{
+       unix_dgram_peer_wake_disconnect(sk, other);
+       wake_up_interruptible_poll(sk_sleep(sk),
+                                  POLLOUT |
+                                  POLLWRNORM |
+                                  POLLWRBAND);
+}
+
+/* preconditions:
+ *     - unix_peer(sk) == other
+ *     - association is stable
+ */
+static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
+{
+       int connected;
+
+       connected = unix_dgram_peer_wake_connect(sk, other);
+
+       if (unix_recvq_full(other))
+               return 1;
+
+       if (connected)
+               unix_dgram_peer_wake_disconnect(sk, other);
+
+       return 0;
+}
+
 static inline int unix_writable(struct sock *sk)
 {
        return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
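
The comment block in the hunk above describes the asymmetric case this machinery serves: a datagram client connected to a server (a /dev/log-style receiver) that is not connected back, whose receive queue gates the client's writability. A hedged userspace sketch of that scenario follows — the socket path is arbitrary and the queue limit in play is assumed to be the net.unix.max_dgram_qlen sysctl default. It shows the flow-control condition that makes the client non-writable and then writable again; the peer_wake relay exists so that a poller actually sleeping on POLLOUT is woken when the server drains its queue.

/* Userspace sketch of the asymmetric dgram case (client connected to a
 * server that is not connected back).  Path is arbitrary; error handling
 * is omitted for brevity. */
#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX,
				    .sun_path   = "/tmp/peerwake-demo" };
	char buf[64] = "x";
	int srv, cli, n = 0;

	unlink(addr.sun_path);
	srv = socket(AF_UNIX, SOCK_DGRAM, 0);
	bind(srv, (struct sockaddr *)&addr, sizeof(addr));

	cli = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0);
	connect(cli, (struct sockaddr *)&addr, sizeof(addr));

	/* Fill the server's receive queue; EAGAIN means flow control hit. */
	while (send(cli, buf, sizeof(buf), 0) > 0)
		n++;
	printf("queued %d datagrams before hitting flow control\n", n);

	struct pollfd pfd = { .fd = cli, .events = POLLOUT };
	printf("poll(POLLOUT) before drain returned %d\n", poll(&pfd, 1, 0));

	/* Draining on the server side must make the client writable again. */
	recv(srv, buf, sizeof(buf), 0);
	printf("poll(POLLOUT) after drain returned %d\n", poll(&pfd, 1, 0));

	close(cli);
	close(srv);
	unlink(addr.sun_path);
	return 0;
}
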
@@ -371,7 +483,7 @@ static void unix_sock_destructor(struct sock *sk)
 #endif
 }
 
-static int unix_release_sock(struct sock *sk, int embrion)
+static void unix_release_sock(struct sock *sk, int embrion)
 {
        struct unix_sock *u = unix_sk(sk);
        struct dentry *dentry;
@@ -409,6 +521,8 @@ static int unix_release_sock(struct sock *sk, int embrion)
                        skpair->sk_state_change(skpair);
                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
                }
+
+               unix_dgram_peer_wake_disconnect(sk, skpair);
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }
@@ -444,8 +558,6 @@ static int unix_release_sock(struct sock *sk, int embrion)
 
        if (unix_tot_inflight)
                unix_gc();              /* Garbage collect fds */
-
-       return 0;
 }
 
 static void init_peercred(struct sock *sk)
@@ -632,6 +744,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
        INIT_LIST_HEAD(&u->link);
        mutex_init(&u->readlock); /* single task reading lock */
        init_waitqueue_head(&u->peer_wait);
+       init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
        unix_insert_socket(unix_sockets_unbound, sk);
 out:
        if (sk == NULL)
@@ -682,9 +795,10 @@ static int unix_release(struct socket *sock)
        if (!sk)
                return 0;
 
+       unix_release_sock(sk, 0);
        sock->sk = NULL;
 
-       return unix_release_sock(sk, 0);
+       return 0;
 }
 
 static int unix_autobind(struct socket *sock)
@@ -697,7 +811,9 @@ static int unix_autobind(struct socket *sock)
        int err;
        unsigned int retries = 0;
 
-       mutex_lock(&u->readlock);
+       err = mutex_lock_interruptible(&u->readlock);
+       if (err)
+               return err;
 
        err = 0;
        if (u->addr)
@@ -830,7 +946,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                goto out;
        addr_len = err;
 
-       mutex_lock(&u->readlock);
+       err = mutex_lock_interruptible(&u->readlock);
+       if (err)
+               goto out;
 
        err = -EINVAL;
        if (u->addr)
@@ -1002,6 +1120,8 @@ restart:
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);
                unix_peer(sk) = other;
+               unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
+
                unix_state_double_unlock(sk, other);
 
                if (other != old_peer)
@@ -1237,6 +1357,15 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
        return 0;
 }
 
+static void unix_sock_inherit_flags(const struct socket *old,
+                                   struct socket *new)
+{
+       if (test_bit(SOCK_PASSCRED, &old->flags))
+               set_bit(SOCK_PASSCRED, &new->flags);
+       if (test_bit(SOCK_PASSSEC, &old->flags))
+               set_bit(SOCK_PASSSEC, &new->flags);
+}
+
 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
 {
        struct sock *sk = sock->sk;
@@ -1271,6 +1400,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
        /* attach accepted sock to socket */
        unix_state_lock(tsk);
        newsock->state = SS_CONNECTED;
+       unix_sock_inherit_flags(sock, newsock);
        sock_graft(tsk, newsock);
        unix_state_unlock(tsk);
        return 0;
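
unix_sock_inherit_flags() copies SOCK_PASSCRED and SOCK_PASSSEC from the listening socket to the newly accepted one. A hedged userspace sketch of the visible effect, assuming an arbitrary socket path: with SO_PASSCRED enabled on the listener only, SCM_CREDENTIALS should still be delivered on the accept()ed socket (error handling omitted for brevity).

/* Userspace sketch: SO_PASSCRED set on the listener is inherited by the
 * accept()ed socket, so credentials arrive without re-arming the option. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX,
				    .sun_path   = "/tmp/passcred-demo" };
	int one = 1, srv, cli, conn;

	unlink(addr.sun_path);
	srv = socket(AF_UNIX, SOCK_STREAM, 0);
	bind(srv, (struct sockaddr *)&addr, sizeof(addr));
	listen(srv, 1);
	setsockopt(srv, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));

	cli = socket(AF_UNIX, SOCK_STREAM, 0);
	connect(cli, (struct sockaddr *)&addr, sizeof(addr));
	conn = accept(srv, NULL, NULL);		/* inherits SOCK_PASSCRED */

	write(cli, "hi", 2);

	char data[8];
	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(struct ucred))]; } u;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = u.buf, .msg_controllen = sizeof(u.buf) };

	recvmsg(conn, &msg, 0);

	struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
	if (c && c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_CREDENTIALS) {
		struct ucred cred;
		memcpy(&cred, CMSG_DATA(c), sizeof(cred));
		printf("sender pid %d uid %d\n", (int)cred.pid, (int)cred.uid);
	} else {
		printf("no credentials attached\n");
	}

	close(cli);
	close(conn);
	close(srv);
	unlink(addr.sun_path);
	return 0;
}
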
@@ -1324,7 +1454,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
        UNIXCB(skb).fp = NULL;
 
        for (i = scm->fp->count-1; i >= 0; i--)
-               unix_notinflight(scm->fp->fp[i]);
+               unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
 
 static void unix_destruct_scm(struct sk_buff *skb)
@@ -1342,6 +1472,21 @@ static void unix_destruct_scm(struct sk_buff *skb)
        sock_wfree(skb);
 }
 
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+       struct user_struct *user = current_user();
+
+       if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+               return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+       return false;
+}
+
 #define MAX_RECURSION_LEVEL 4
 
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
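
too_many_unix_fds() charges in-flight descriptors to the sending user and makes unix_attach_fds() fail with -ETOOMANYREFS once that total exceeds the sender's RLIMIT_NOFILE (unless the sender has CAP_SYS_RESOURCE or CAP_SYS_ADMIN). A hedged userspace sketch of the operation the limit applies to — ordinary SCM_RIGHTS fd passing over an AF_UNIX socket; actually hitting the limit would require queueing RLIMIT_NOFILE descriptors that are never received:

/* Userspace sketch of the affected code path: passing a file descriptor with
 * SCM_RIGHTS.  The passed fd is arbitrary; error handling is minimal. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int send_fd(int sock, int fd)
{
	char dummy = 'F';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(int))]; } u;
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = u.buf, .msg_controllen = sizeof(u.buf) };
	struct cmsghdr *c = CMSG_FIRSTHDR(&msg);

	c->cmsg_level = SOL_SOCKET;
	c->cmsg_type  = SCM_RIGHTS;
	c->cmsg_len   = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(c), &fd, sizeof(int));

	if (sendmsg(sock, &msg, 0) < 0) {
		if (errno == ETOOMANYREFS)	/* limit introduced by this patch */
			fprintf(stderr, "too many in-flight fds for this user\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	int sv[2];

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);
	if (send_fd(sv[0], STDIN_FILENO) == 0)
		printf("fd queued in-flight on the receiving socket\n");
	close(sv[0]);
	close(sv[1]);
	return 0;
}
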
@@ -1350,6 +1495,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
        unsigned char max_level = 0;
        int unix_sock_count = 0;
 
+       if (too_many_unix_fds(current))
+               return -ETOOMANYREFS;
+
        for (i = scm->fp->count - 1; i >= 0; i--) {
                struct sock *sk = unix_get_socket(scm->fp->fp[i]);
 
@@ -1371,10 +1519,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
        if (!UNIXCB(skb).fp)
                return -ENOMEM;
 
-       if (unix_sock_count) {
-               for (i = scm->fp->count - 1; i >= 0; i--)
-                       unix_inflight(scm->fp->fp[i]);
-       }
+       for (i = scm->fp->count - 1; i >= 0; i--)
+               unix_inflight(scm->fp->user, scm->fp->fp[i]);
        return max_level;
 }
 
@@ -1431,11 +1577,12 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
        long timeo;
        struct scm_cookie tmp_scm;
        int max_level;
+       int sk_locked;
 
        if (NULL == siocb->scm)
                siocb->scm = &tmp_scm;
        wait_for_unix_gc();
-       err = scm_send(sock, msg, siocb->scm);
+       err = scm_send(sock, msg, siocb->scm, false);
        if (err < 0)
                return err;
 
@@ -1499,12 +1646,14 @@ restart:
                goto out_free;
        }
 
+       sk_locked = 0;
        unix_state_lock(other);
+restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;
 
-       if (sock_flag(other, SOCK_DEAD)) {
+       if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
@@ -1512,10 +1661,14 @@ restart:
                unix_state_unlock(other);
                sock_put(other);
 
+               if (!sk_locked)
+                       unix_state_lock(sk);
+
                err = 0;
-               unix_state_lock(sk);
                if (unix_peer(sk) == other) {
                        unix_peer(sk) = NULL;
+                       unix_dgram_peer_wake_disconnect_wakeup(sk, other);
+
                        unix_state_unlock(sk);
 
                        unix_dgram_disconnected(sk, other);
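
For the dead-peer branch above: once the peer of a connected datagram socket has been released, the next send runs into the SOCK_DEAD check, the association is dropped (now with a wakeup of would-be writers), and the sender gets an error — ECONNREFUSED on Linux for this case. A hedged userspace sketch using a socketpair:

/* Userspace sketch of the dead-peer case: the receiving end of a connected
 * datagram socket goes away, and a later send on the sender fails. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);
	close(sv[1]);				/* peer dies */

	if (send(sv[0], "ping", 4, 0) < 0)
		printf("send after peer close: %s\n", strerror(errno));

	close(sv[0]);
	return 0;
}
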
@@ -1541,21 +1694,43 @@ restart:
                        goto out_unlock;
        }
 
-       if (unix_peer(other) != sk && unix_recvq_full(other)) {
-               if (!timeo) {
-                       err = -EAGAIN;
-                       goto out_unlock;
+       /* other == sk && unix_peer(other) != sk if
+        * - unix_peer(sk) == NULL, destination address bound to sk
+        * - unix_peer(sk) == sk by time of get but disconnected before lock
+        */
+       if (other != sk &&
+           unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+               if (timeo) {
+                       timeo = unix_wait_for_peer(other, timeo);
+
+                       err = sock_intr_errno(timeo);
+                       if (signal_pending(current))
+                               goto out_free;
+
+                       goto restart;
                }
 
-               timeo = unix_wait_for_peer(other, timeo);
+               if (!sk_locked) {
+                       unix_state_unlock(other);
+                       unix_state_double_lock(sk, other);
+               }
 
-               err = sock_intr_errno(timeo);
-               if (signal_pending(current))
-                       goto out_free;
+               if (unix_peer(sk) != other ||
+                   unix_dgram_peer_wake_me(sk, other)) {
+                       err = -EAGAIN;
+                       sk_locked = 1;
+                       goto out_unlock;
+               }
 
-               goto restart;
+               if (!sk_locked) {
+                       sk_locked = 1;
+                       goto restart_locked;
+               }
        }
 
+       if (unlikely(sk_locked))
+               unix_state_unlock(sk);
+
        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
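
The comment in this hunk notes the other == sk case: the destination resolves to the sending socket itself, so the receive-queue flow-control check is skipped because sender and reader are the same socket. A hedged userspace sketch of how that case arises — an unconnected datagram socket doing sendto() to its own bound address (the path is arbitrary):

/* Userspace sketch of the "other == sk" case: a bound, unconnected datagram
 * socket sends to its own address and reads the datagram back. */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX,
				    .sun_path   = "/tmp/self-dgram-demo" };
	char buf[16];
	int fd;

	unlink(addr.sun_path);
	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	/* Destination is the socket's own address: in the kernel this is the
	 * other == sk case, so the queue-full check above does not apply. */
	sendto(fd, "ping", 4, 0, (struct sockaddr *)&addr, sizeof(addr));
	recv(fd, buf, sizeof(buf), 0);
	printf("got %.4s from myself\n", buf);

	close(fd);
	unlink(addr.sun_path);
	return 0;
}
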
@@ -1569,6 +1744,8 @@ restart:
        return len;
 
 out_unlock:
+       if (sk_locked)
+               unix_state_unlock(sk);
        unix_state_unlock(other);
 out_free:
        kfree_skb(skb);
@@ -1596,7 +1773,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
        if (NULL == siocb->scm)
                siocb->scm = &tmp_scm;
        wait_for_unix_gc();
-       err = scm_send(sock, msg, siocb->scm);
+       err = scm_send(sock, msg, siocb->scm, false);
        if (err < 0)
                return err;
 
@@ -1735,7 +1912,6 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
 {
        struct unix_sock *u = unix_sk(sk);
 
-       msg->msg_namelen = 0;
        if (u->addr) {
                msg->msg_namelen = u->addr->len;
                memcpy(msg->msg_name, u->addr->name, u->addr->len);
@@ -1758,11 +1934,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
        if (flags&MSG_OOB)
                goto out;
 
-       msg->msg_namelen = 0;
-
        err = mutex_lock_interruptible(&u->readlock);
-       if (err) {
-               err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
+       if (unlikely(err)) {
+               /* recvmsg() in non blocking mode is supposed to return -EAGAIN
+                * sk_rcvtimeo is not honored by mutex_lock_interruptible()
+                */
+               err = noblock ? -EAGAIN : -ERESTARTSYS;
                goto out;
        }
 
@@ -1857,6 +2034,10 @@ static long unix_stream_data_wait(struct sock *sk, long timeo)
                unix_state_unlock(sk);
                timeo = schedule_timeout(timeo);
                unix_state_lock(sk);
+
+               if (sock_flag(sk, SOCK_DEAD))
+                       break;
+
                clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        }
 
@@ -1877,23 +2058,24 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr = msg->msg_name;
        int copied = 0;
+       int noblock = flags & MSG_DONTWAIT;
        int check_creds = 0;
        int target;
        int err = 0;
        long timeo;
 
-       err = -EINVAL;
-       if (sk->sk_state != TCP_ESTABLISHED)
+       if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+               err = -EINVAL;
                goto out;
+       }
 
-       err = -EOPNOTSUPP;
-       if (flags&MSG_OOB)
+       if (unlikely(flags & MSG_OOB)) {
+               err = -EOPNOTSUPP;
                goto out;
+       }
 
        target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
-       timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
-
-       msg->msg_namelen = 0;
+       timeo = sock_rcvtimeo(sk, noblock);
 
        /* Lock the socket to prevent queue disordering
         * while sleeps in memcpy_tomsg
@@ -1904,18 +2086,18 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                memset(&tmp_scm, 0, sizeof(tmp_scm));
        }
 
-       err = mutex_lock_interruptible(&u->readlock);
-       if (err) {
-               err = sock_intr_errno(timeo);
-               goto out;
-       }
+       mutex_lock(&u->readlock);
 
        do {
                int chunk;
                struct sk_buff *skb;
 
                unix_state_lock(sk);
-               skb = skb_dequeue(&sk->sk_receive_queue);
+               if (sock_flag(sk, SOCK_DEAD)) {
+                       err = -ECONNRESET;
+                       goto unlock;
+               }
+               skb = skb_peek(&sk->sk_receive_queue);
                if (skb == NULL) {
                        unix_sk(sk)->recursion_level = 0;
                        if (copied >= target)
@@ -1932,19 +2114,22 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                                goto unlock;
 
                        unix_state_unlock(sk);
-                       err = -EAGAIN;
-                       if (!timeo)
+                       if (!timeo) {
+                               err = -EAGAIN;
                                break;
+                       }
+
                        mutex_unlock(&u->readlock);
 
                        timeo = unix_stream_data_wait(sk, timeo);
 
-                       if (signal_pending(current)
-                           ||  mutex_lock_interruptible(&u->readlock)) {
+                       if (signal_pending(current)) {
                                err = sock_intr_errno(timeo);
+                               scm_destroy(siocb->scm);
                                goto out;
                        }
 
+                       mutex_lock(&u->readlock);
                        continue;
  unlock:
                        unix_state_unlock(sk);
@@ -1955,12 +2140,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                if (check_creds) {
                        /* Never glue messages from different writers */
                        if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
-                           (UNIXCB(skb).cred != siocb->scm->cred)) {
-                               skb_queue_head(&sk->sk_receive_queue, skb);
-                               sk->sk_data_ready(sk, skb->len);
+                           (UNIXCB(skb).cred != siocb->scm->cred))
                                break;
-                       }
-               } else {
+               } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
                        /* Copy credentials */
                        scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
                        check_creds = 1;
@@ -1974,8 +2156,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
                chunk = min_t(unsigned int, skb->len, size);
                if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
-                       skb_queue_head(&sk->sk_receive_queue, skb);
-                       sk->sk_data_ready(sk, skb->len);
                        if (copied == 0)
                                copied = -EFAULT;
                        break;
@@ -1990,13 +2170,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                        if (UNIXCB(skb).fp)
                                unix_detach_fds(siocb->scm, skb);
 
-                       /* put the skb back if we didn't use it up.. */
-                       if (skb->len) {
-                               skb_queue_head(&sk->sk_receive_queue, skb);
-                               sk->sk_data_ready(sk, skb->len);
+                       if (skb->len)
                                break;
-                       }
 
+                       skb_unlink(skb, &sk->sk_receive_queue);
                        consume_skb(skb);
 
                        if (siocb->scm->fp)
@@ -2007,9 +2184,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                        if (UNIXCB(skb).fp)
                                siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 
-                       /* put message back and return */
-                       skb_queue_head(&sk->sk_receive_queue, skb);
-                       sk->sk_data_ready(sk, skb->len);
                        break;
                }
        } while (size);
@@ -2174,14 +2348,16 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
                return mask;
 
        writable = unix_writable(sk);
-       other = unix_peer_get(sk);
-       if (other) {
-               if (unix_peer(other) != sk) {
-                       sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
-                       if (unix_recvq_full(other))
-                               writable = 0;
-               }
-               sock_put(other);
+       if (writable) {
+               unix_state_lock(sk);
+
+               other = unix_peer(sk);
+               if (other && unix_peer(other) != sk &&
+                   unix_recvq_full(other) &&
+                   unix_dgram_peer_wake_me(sk, other))
+                       writable = 0;
+
+               unix_state_unlock(sk);
        }
 
        if (writable)
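
The rewritten unix_dgram_poll() no longer parks the poller on the peer's own wait queue via sock_poll_wait(); it registers through the peer_wake relay instead, so a polled client can safely outlive the server it is connected to. A hedged userspace sketch of the lifetime pattern the comment block in the first hunk warns about — an epoll registration for writability that stays in place while the server socket goes away (path and queue-filling details are arbitrary):

/* Userspace sketch of the lifetime problem: a client stays registered in
 * epoll for writability while the server it is connected to is closed.
 * The old code would have left the poller on the dead server's peer_wait. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX,
				    .sun_path   = "/tmp/epoll-peer-demo" };
	struct epoll_event ev = { .events = EPOLLOUT };
	char buf[64] = "x";
	int srv, cli, ep;

	unlink(addr.sun_path);
	srv = socket(AF_UNIX, SOCK_DGRAM, 0);
	bind(srv, (struct sockaddr *)&addr, sizeof(addr));

	cli = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0);
	connect(cli, (struct sockaddr *)&addr, sizeof(addr));
	while (send(cli, buf, sizeof(buf), 0) > 0)
		;				/* fill the server's receive queue */

	ep = epoll_create1(0);
	ev.data.fd = cli;
	epoll_ctl(ep, EPOLL_CTL_ADD, cli, &ev);	/* waits for writability */

	/* Server disappears while the client is still registered; the epoll
	 * entry must not keep referencing the dead server's wait queue. */
	close(srv);
	unlink(addr.sun_path);

	epoll_ctl(ep, EPOLL_CTL_DEL, cli, NULL);
	close(ep);
	close(cli);
	printf("done\n");
	return 0;
}
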