af_unix: fix EPOLLET regression for stream sockets
authorEric Dumazet <eric.dumazet@gmail.com>
Sat, 28 Jan 2012 16:11:03 +0000 (16:11 +0000)
committerDavid S. Miller <davem@davemloft.net>
Mon, 30 Jan 2012 17:45:07 +0000 (12:45 -0500)
Commit 0884d7aa24 (AF_UNIX: Fix poll blocking problem when reading from
a stream socket) added a regression for epoll() in Edge Triggered mode
(EPOLLET)

Appropriate fix is to use skb_peek()/skb_unlink() instead of
skb_dequeue(), and only call skb_unlink() when skb is fully consumed.

This remove the need to requeue a partial skb into sk_receive_queue head
and the extra sk->sk_data_ready() calls that added the regression.

This is safe because once skb is given to sk_receive_queue, it is not
modified by a writer, and readers are serialized by u->readlock mutex.

This also reduce number of spinlock acquisition for small reads or
MSG_PEEK users so should improve overall performance.

Reported-by: Nick Mathewson <nickm@freehaven.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Alexey Moiseytsev <himeraster@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
net/unix/af_unix.c

index aad8fb6..85d3bb7 100644 (file)
@@ -1918,7 +1918,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                struct sk_buff *skb;
 
                unix_state_lock(sk);
-               skb = skb_dequeue(&sk->sk_receive_queue);
+               skb = skb_peek(&sk->sk_receive_queue);
                if (skb == NULL) {
                        unix_sk(sk)->recursion_level = 0;
                        if (copied >= target)
@@ -1958,11 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                if (check_creds) {
                        /* Never glue messages from different writers */
                        if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
-                           (UNIXCB(skb).cred != siocb->scm->cred)) {
-                               skb_queue_head(&sk->sk_receive_queue, skb);
-                               sk->sk_data_ready(sk, skb->len);
+                           (UNIXCB(skb).cred != siocb->scm->cred))
                                break;
-                       }
                } else {
                        /* Copy credentials */
                        scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
@@ -1977,8 +1974,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
                chunk = min_t(unsigned int, skb->len, size);
                if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
-                       skb_queue_head(&sk->sk_receive_queue, skb);
-                       sk->sk_data_ready(sk, skb->len);
                        if (copied == 0)
                                copied = -EFAULT;
                        break;
@@ -1993,13 +1988,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                        if (UNIXCB(skb).fp)
                                unix_detach_fds(siocb->scm, skb);
 
-                       /* put the skb back if we didn't use it up.. */
-                       if (skb->len) {
-                               skb_queue_head(&sk->sk_receive_queue, skb);
-                               sk->sk_data_ready(sk, skb->len);
+                       if (skb->len)
                                break;
-                       }
 
+                       skb_unlink(skb, &sk->sk_receive_queue);
                        consume_skb(skb);
 
                        if (siocb->scm->fp)
@@ -2010,9 +2002,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
                        if (UNIXCB(skb).fp)
                                siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
 
-                       /* put message back and return */
-                       skb_queue_head(&sk->sk_receive_queue, skb);
-                       sk->sk_data_ready(sk, skb->len);
                        break;
                }
        } while (size);