Merge branch 'rmobile-latest' of git://git.kernel.org/pub/scm/linux/kernel/git/lethal...
[pandora-kernel.git] / kernel / futex.c
index b766d28..6570c45 100644
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
        return NULL;
 }
 
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+                                     u32 uval, u32 newval)
 {
-       u32 curval;
+       int ret;
 
        pagefault_disable();
-       curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+       ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
        pagefault_enable();
 
-       return curval;
+       return ret;
 }
 
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
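
The new calling convention separates the fault status from the observed futex value: the return is 0 or -EFAULT, and the value read from *uaddr comes back through the pointer. A schematic caller, sketching the pattern the hunks below apply throughout this file ('expected' and 'desired' are placeholder names):

	u32 curval;

	if (cmpxchg_futex_value_locked(&curval, uaddr, expected, desired))
		return -EFAULT;		/* faulted while accessing *uaddr */
	if (curval != expected)
		goto retry;		/* lost the race: the word changed */
	/* success: *uaddr now holds 'desired' */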
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                struct task_struct *task, int set_waiters)
 {
        int lock_taken, ret, ownerdied = 0;
-       u32 uval, newval, curval;
+       u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
        ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
         * (by doing a 0 -> TID atomic cmpxchg), while holding all
         * the locks. It will most likely not succeed.
         */
-       newval = task_pid_vnr(task);
+       newval = vpid;
        if (set_waiters)
                newval |= FUTEX_WAITERS;
 
-       curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-       if (unlikely(curval == -EFAULT))
+       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
                return -EFAULT;
 
        /*
         * Detect deadlocks.
         */
-       if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+       if (unlikely((curval & FUTEX_TID_MASK) == vpid))
                return -EDEADLK;
 
        /*
@@ -723,14 +722,12 @@ retry:
         */
        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
                /* Keep the OWNER_DIED bit */
-               newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+               newval = (curval & ~FUTEX_TID_MASK) | vpid;
                ownerdied = 0;
                lock_taken = 1;
        }
 
-       curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-       if (unlikely(curval == -EFAULT))
+       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                return -EFAULT;
        if (unlikely(curval != uval))
                goto retry;
@@ -775,6 +772,24 @@ retry:
        return ret;
 }
 
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+       struct futex_hash_bucket *hb;
+
+       if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
+                       || plist_node_empty(&q->list)))
+               return;
+
+       hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+       plist_del(&q->list, &hb->chain);
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
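
__unqueue_futex() recovers the hash bucket from q->lock_ptr with container_of(), relying on the lock being embedded in the bucket. A minimal sketch of the idiom, assuming the field layout this file uses:

	struct futex_hash_bucket {
		spinlock_t lock;
		struct plist_head chain;
	};

	/* q->lock_ptr points at hb->lock, so the bucket is recoverable: */
	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);	/* the real list head, not q */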
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
         */
        get_task_struct(p);
 
-       plist_del(&q->list, &q->list.plist);
+       __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as
         * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 
                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-               curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-               if (curval == -EFAULT)
+               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
                        ret = -EFAULT;
                else if (curval != uval)
                        ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
         * There is no waiter, so we unlock the futex. The owner died
         * died bit need not be preserved here. We are the owner:
         */
-       oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
-       if (oldval == -EFAULT)
-               return oldval;
+       if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+               return -EFAULT;
        if (oldval != uval)
                return -EAGAIN;
 
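
For context, the lock-word protocol these cmpxchg calls implement: 0 means unlocked, a TID means owned by that thread, and FUTEX_WAITERS flags contention. A hedged userspace-side sketch of the uncontended fast paths (C11 atomics; illustrative only — a real fast path must also honor FUTEX_WAITERS and fall back to FUTEX_LOCK_PI/FUTEX_UNLOCK_PI):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	static _Atomic uint32_t futex_word;

	static bool try_lock(uint32_t tid)
	{
		uint32_t expected = 0;
		/* 0 -> TID: uncontended acquire */
		return atomic_compare_exchange_strong(&futex_word, &expected, tid);
	}

	static bool try_unlock(uint32_t tid)
	{
		uint32_t expected = tid;
		/* TID -> 0: uncontended release */
		return atomic_compare_exchange_strong(&futex_word, &expected, 0);
	}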
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
                plist_del(&q->list, &hb1->chain);
                plist_add(&q->list, &hb2->chain);
                q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-               q->list.plist.spinlock = &hb2->lock;
-#endif
        }
        get_futex_key_refs(key2);
        q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
        get_futex_key_refs(key);
        q->key = *key;
 
-       WARN_ON(plist_node_empty(&q->list));
-       plist_del(&q->list, &q->list.plist);
+       __unqueue_futex(q);
 
        WARN_ON(!q->rt_waiter);
        q->rt_waiter = NULL;
 
        q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-       q->list.plist.spinlock = &hb->lock;
-#endif
 
        wake_up_state(q->task, TASK_NORMAL);
 }
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
        prio = min(current->normal_prio, MAX_RT_PRIO);
 
        plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
-       q->list.plist.spinlock = &hb->lock;
-#endif
        plist_add(&q->list, &hb->chain);
        q->task = current;
        spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
                        spin_unlock(lock_ptr);
                        goto retry;
                }
-               WARN_ON(plist_node_empty(&q->list));
-               plist_del(&q->list, &q->list.plist);
+               __unqueue_futex(q);
 
                BUG_ON(q->pi_state);
 
@@ -1525,8 +1525,7 @@ retry:
 static void unqueue_me_pi(struct futex_q *q)
        __releases(q->lock_ptr)
 {
-       WARN_ON(plist_node_empty(&q->list));
-       plist_del(&q->list, &q->list.plist);
+       __unqueue_futex(q);
 
        BUG_ON(!q->pi_state);
        free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
        /*
         * We are here either because we stole the rtmutex from the
-        * pending owner or we are the pending owner which failed to
-        * get the rtmutex. We have to replace the pending owner TID
-        * in the user space variable. This must be atomic as we have
-        * to preserve the owner died bit here.
+        * previous highest priority waiter or we are the highest priority
+        * waiter but failed to get the rtmutex the first time.
+        * We have to replace the newowner TID in the user space variable.
+        * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
         * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
        while (1) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
-               curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-               if (curval == -EFAULT)
+               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
                        goto handle_fault;
                if (curval == uval)
                        break;
@@ -1608,8 +1605,8 @@ retry:
 
        /*
         * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the pending
-        * owner itself or the task which stole the rtmutex) the
+        * lock here. That gives the other task (either the highest priority
+        * waiter itself or the task which stole the rtmutex) the
         * chance to try the fixup of the pi_state. So once we are
         * back from handling the fault we need to check the pi_state
         * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                /*
                 * pi_state is incorrect, some other task did a lock steal and
                 * we returned due to timeout or signal without taking the
-                * rt_mutex. Too late. We can access the rt_mutex_owner without
-                * locking, as the other task is now blocked on the hash bucket
-                * lock. Fix the state up.
+                * rt_mutex. Too late.
                 */
+               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+               if (!owner)
+                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
                ret = fixup_pi_state_owner(uaddr, q, owner);
                goto out;
        }
 
        /*
         * Paranoia check. If we did not take the lock, then we should not be
-        * the owner, nor the pending owner, of the rt_mutex.
+        * the owner of the rt_mutex.
         */
        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
         *
         * The basic logical guarantee of a futex is that it blocks ONLY
         * if cond(var) is known to be true at the time of blocking, for
-        * any cond.  If we queued after testing *uaddr, that would open
-        * a race condition where we could block indefinitely with
+        * any cond.  If we locked the hash-bucket after testing *uaddr, that
+        * would open a race condition where we could block indefinitely with
         * cond(var) false, which would violate the guarantee.
         *
-        * A consequence is that futex_wait() can return zero and absorb
-        * a wakeup when *uaddr != val on entry to the syscall.  This is
-        * rare, but normal.
+        * On the other hand, we insert q and release the hash-bucket only
+        * after testing *uaddr.  This guarantees that futex_wait() will NOT
+        * absorb a wakeup if *uaddr does not match the desired values
+        * while the syscall executes.
         */
 retry:
        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
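
The rewritten comment above nails down the ordering: read *uaddr under the hash-bucket lock, and only queue if it still holds the expected value. The userspace contract this preserves, as an illustrative wrapper (not code from this file):

	#include <linux/futex.h>
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Blocks only while *uaddr still holds 'val'; returns -1 with errno
	 * EWOULDBLOCK if the value already differs. */
	static int futex_wait(uint32_t *uaddr, uint32_t val)
	{
		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0);
	}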
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
-       u32 uval;
        struct plist_head *head;
        union futex_key key = FUTEX_KEY_INIT;
+       u32 uval, vpid = task_pid_vnr(current);
        int ret;
 
 retry:
@@ -2057,7 +2057,7 @@ retry:
        /*
         * We release only a lock we actually own:
         */
-       if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+       if ((uval & FUTEX_TID_MASK) != vpid)
                return -EPERM;
 
        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
         * again. If it succeeds then we can return without waking
         * anyone else up:
         */
-       if (!(uval & FUTEX_OWNER_DIED))
-               uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
-       if (unlikely(uval == -EFAULT))
+       if (!(uval & FUTEX_OWNER_DIED) &&
+           cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
                goto pi_faulted;
        /*
         * Rare case: we managed to release the lock atomically,
         * no need to wake anyone else up:
         */
-       if (unlikely(uval == task_pid_vnr(current)))
+       if (unlikely(uval == vpid))
                goto out_unlock;
 
        /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 * We were woken prior to requeue by a timeout or a signal.
                 * Unqueue the futex_q and determine which it was.
                 */
-               plist_del(&q->list, &q->list.plist);
+               plist_del(&q->list, &hb->chain);
 
                /* Handle spurious wakeups gracefully */
                ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
                        goto err_unlock;
                ret = -EPERM;
                pcred = __task_cred(p);
+               /*
+                * If the victim is in a different user_ns, the uids are not
+                * comparable, so we must have CAP_SYS_PTRACE.
+                */
+               if (cred->user->user_ns != pcred->user->user_ns) {
+                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+                               goto err_unlock;
+                       goto ok;
+               }
+               /* If the victim is in the same user_ns, the uids are comparable */
                if (cred->euid != pcred->euid &&
                    cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
+                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
                        goto err_unlock;
+ok:
                head = p->robust_list;
                rcu_read_unlock();
        }
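
The permission logic distilled into a standalone predicate, as a sketch (assuming the same cred fields as above; no such helper exists in this file):

	static bool may_read_robust_list(const struct cred *cred,
					 const struct cred *pcred)
	{
		/* Cross-namespace uids are not comparable: require
		 * CAP_SYS_PTRACE in the victim's user_ns. */
		if (cred->user->user_ns != pcred->user->user_ns)
			return ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);
		/* Same namespace: a euid match or the capability suffices. */
		return cred->euid == pcred->euid ||
		       cred->euid == pcred->uid ||
		       ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE);
	}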
@@ -2463,11 +2469,20 @@ retry:
                 * userspace.
                 */
                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-               nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
-               if (nval == -EFAULT)
-                       return -1;
-
+               /*
+                * We are not holding a lock here, but we want to have
+                * the pagefault_disable/enable() protection because
+                * we want to handle the fault gracefully. If the
+                * access fails we try to fault in the futex with R/W
+                * verification via get_user_pages. get_user() above
+                * does not guarantee R/W access. If that fails we
+                * give up and leave the futex locked.
+                */
+               if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+                       if (fault_in_user_writeable(uaddr))
+                               return -1;
+                       goto retry;
+               }
                if (nval != uval)
                        goto retry;
 
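
A worked example of the word transition performed here, using the uapi bit values (FUTEX_WAITERS = 0x80000000, FUTEX_OWNER_DIED = 0x40000000):

	uval = 0x800004d2;	/* dead owner's TID 1234, FUTEX_WAITERS set */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;	/* 0xc0000000 */

The dead owner's TID is dropped, waiters stay flagged, and whichever waiter takes the lock next sees FUTEX_OWNER_DIED and can recover the state.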
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
         * implementation, the non-functional ones will return
         * -ENOSYS.
         */
-       curval = cmpxchg_futex_value_locked(NULL, 0, 0);
-       if (curval == -EFAULT)
+       if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
                futex_cmpxchg_enabled = 1;
 
        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
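
On the detection idiom in this last hunk: an architecture with a working futex_atomic_cmpxchg_inatomic() dereferences uaddr, so the NULL argument faults and the call returns -EFAULT; a stub for architectures without the primitive returns -ENOSYS without touching memory, leaving futex_cmpxchg_enabled clear. In outline (a behavioral sketch, not code from this patch):

	/* arch with a real implementation: dereferences uaddr, NULL faults */
	cmpxchg_futex_value_locked(&curval, NULL, 0, 0);	/* -EFAULT */

	/* arch relying on the generic stub: never touches memory */
	cmpxchg_futex_value_locked(&curval, NULL, 0, 0);	/* -ENOSYS */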