futex: Fix potential use-after-free in FUTEX_REQUEUE_PI
[pandora-kernel.git] / kernel / futex.c
index ea87f4d..ad69658 100644 (file)
@@ -59,6 +59,8 @@
 #include <linux/magic.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/hugetlb.h>
 
 #include <asm/futex.h>
 
@@ -283,7 +285,7 @@ again:
                put_page(page);
                /* serialize against __split_huge_page_splitting() */
                local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
                        page_head = compound_head(page);
                        /*
                         * page_head is valid pointer but we must pin
@@ -314,17 +316,29 @@ again:
 #endif
 
        lock_page(page_head);
+
+       /*
+        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * page; but it might be the ZERO_PAGE or in the gate area or
+        * in a special mapping (all cases which we are happy to fail);
+        * or it may have been a good file page when get_user_pages_fast
+        * found it, but truncated or holepunched or subjected to
+        * invalidate_complete_page2 before we got the page lock (also
+        * cases which we are happy to fail).  And we hold a reference,
+        * so refcount care in invalidate_complete_page's remove_mapping
+        * prevents drop_caches from setting mapping to NULL beneath us.
+        *
+        * The case we do have to guard against is when memory pressure made
+        * shmem_writepage move it from filecache to swapcache beneath us:
+        * an unlikely race, but we do need to retry for page_head->mapping.
+        */
        if (!page_head->mapping) {
+               int shmem_swizzled = PageSwapCache(page_head);
                unlock_page(page_head);
                put_page(page_head);
-               /*
-               * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
-               * trying to find one. RW mapping would have COW'd (and thus
-               * have a mapping) so this page is RO and won't ever change.
-               */
-               if ((page_head == ZERO_PAGE(address)))
-                       return -EFAULT;
-               goto again;
+               if (shmem_swizzled)
+                       goto again;
+               return -EFAULT;
        }
 
        /*
@@ -350,7 +364,7 @@ again:
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
                key->shared.inode = page_head->mapping->host;
-               key->shared.pgoff = page_head->index;
+               key->shared.pgoff = basepage_index(page);
        }
 
        get_futex_key_refs(key);
@@ -470,8 +484,14 @@ static struct futex_pi_state * alloc_pi_state(void)
        return pi_state;
 }
 
+/*
+ * Must be called with the hb lock held.
+ */
 static void free_pi_state(struct futex_pi_state *pi_state)
 {
+       if (!pi_state)
+               return;
+
        if (!atomic_dec_and_test(&pi_state->refcount))
                return;
 
@@ -574,6 +594,55 @@ void exit_pi_state_list(struct task_struct *curr)
        raw_spin_unlock_irq(&curr->pi_lock);
 }
 
+/*
+ * We need to check the following states:
+ *
+ *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
+ *
+ * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
+ * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
+ *
+ * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
+ *
+ * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
+ * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
+ *
+ * [6]  Found  | Found    | task      | 0         | 1      | Valid
+ *
+ * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
+ *
+ * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
+ * [9]  Found  | Found    | task      | 0         | 0      | Invalid
+ * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ *     came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ *      thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ *     value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ *     and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ *     the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ *     except exit_robust_list(), but this is indicated by the
+ *     FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ *     TID out of sync.
+ */
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                union futex_key *key, struct futex_pi_state **ps)
@@ -589,12 +658,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        plist_for_each_entry_safe(this, next, head, list) {
                if (match_futex(&this->key, key)) {
                        /*
-                        * Another waiter already exists - bump up
-                        * the refcount and return its pi_state:
+                        * Sanity check the waiter before increasing
+                        * the refcount and attaching to it.
                         */
                        pi_state = this->pi_state;
                        /*
-                        * Userspace might have messed up non-PI and PI futexes
+                        * Userspace might have messed up non-PI and
+                        * PI futexes [3]
                         */
                        if (unlikely(!pi_state))
                                return -EINVAL;
@@ -602,34 +672,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                        WARN_ON(!atomic_read(&pi_state->refcount));
 
                        /*
-                        * When pi_state->owner is NULL then the owner died
-                        * and another waiter is on the fly. pi_state->owner
-                        * is fixed up by the task which acquires
-                        * pi_state->rt_mutex.
-                        *
-                        * We do not check for pid == 0 which can happen when
-                        * the owner died and robust_list_exit() cleared the
-                        * TID.
+                        * Handle the owner died case:
                         */
-                       if (pid && pi_state->owner) {
+                       if (uval & FUTEX_OWNER_DIED) {
                                /*
-                                * Bail out if user space manipulated the
-                                * futex value.
+                                * exit_pi_state_list sets owner to NULL and
+                                * wakes the topmost waiter. The task which
+                                * acquires the pi_state->rt_mutex will fixup
+                                * owner.
                                 */
-                               if (pid != task_pid_vnr(pi_state->owner))
+                               if (!pi_state->owner) {
+                                       /*
+                                        * No pi state owner, but the user
+                                        * space TID is not 0. Inconsistent
+                                        * state. [5]
+                                        */
+                                       if (pid)
+                                               return -EINVAL;
+                                       /*
+                                        * Take a ref on the state and
+                                        * return. [4]
+                                        */
+                                       goto out_state;
+                               }
+
+                               /*
+                                * If TID is 0, then either the dying owner
+                                * has not yet executed exit_pi_state_list()
+                                * or some waiter acquired the rtmutex in the
+                                * pi state, but did not yet fixup the TID in
+                                * user space.
+                                *
+                                * Take a ref on the state and return. [6]
+                                */
+                               if (!pid)
+                                       goto out_state;
+                       } else {
+                               /*
+                                * If the owner died bit is not set,
+                                * then the pi_state must have an
+                                * owner. [7]
+                                */
+                               if (!pi_state->owner)
                                        return -EINVAL;
                        }
 
+                       /*
+                        * Bail out if user space manipulated the
+                        * futex value. If pi state exists then the
+                        * owner TID must be the same as the user
+                        * space TID. [9/10]
+                        */
+                       if (pid != task_pid_vnr(pi_state->owner))
+                               return -EINVAL;
+
+               out_state:
                        atomic_inc(&pi_state->refcount);
                        *ps = pi_state;
-
                        return 0;
                }
        }
 
        /*
         * We are the first waiter - try to look up the real owner and attach
-        * the new pi_state to it, but bail out when TID = 0
+        * the new pi_state to it, but bail out when TID = 0 [1]
         */
        if (!pid)
                return -ESRCH;
@@ -637,6 +743,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        if (!p)
                return -ESRCH;
 
+       if (!p->mm) {
+               put_task_struct(p);
+               return -EPERM;
+       }
+
        /*
         * We need to look at the task state flags to figure out,
         * whether the task is exiting. To protect against the do_exit
@@ -657,6 +768,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                return ret;
        }
 
+       /*
+        * No existing pi state. First waiter. [2]
+        */
        pi_state = alloc_pi_state();
 
        /*
@@ -703,7 +817,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                struct futex_pi_state **ps,
                                struct task_struct *task, int set_waiters)
 {
-       int lock_taken, ret, ownerdied = 0;
+       int lock_taken, ret, force_take = 0;
        u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
@@ -728,10 +842,18 @@ retry:
                return -EDEADLK;
 
        /*
-        * Surprise - we got the lock. Just return to userspace:
+        * Surprise - we got the lock, but we do not trust user space at all.
         */
-       if (unlikely(!curval))
-               return 1;
+       if (unlikely(!curval)) {
+               /*
+                * We verify whether there is kernel state for this
+                * futex. If not, we can safely assume, that the 0 ->
+                * TID transition is correct. If state exists, we do
+                * not bother to fixup the user space state as it was
+                * corrupted already.
+                */
+               return futex_top_waiter(hb, key) ? -EINVAL : 1;
+       }
 
        uval = curval;
 
@@ -742,17 +864,15 @@ retry:
        newval = curval | FUTEX_WAITERS;
 
        /*
-        * There are two cases, where a futex might have no owner (the
-        * owner TID is 0): OWNER_DIED. We take over the futex in this
-        * case. We also do an unconditional take over, when the owner
-        * of the futex died.
-        *
-        * This is safe as we are protected by the hash bucket lock !
+        * Should we force take the futex? See below.
         */
-       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-               /* Keep the OWNER_DIED bit */
+       if (unlikely(force_take)) {
+               /*
+                * Keep the OWNER_DIED and the WAITERS bit and set the
+                * new TID value.
+                */
                newval = (curval & ~FUTEX_TID_MASK) | vpid;
-               ownerdied = 0;
+               force_take = 0;
                lock_taken = 1;
        }
 
@@ -762,7 +882,7 @@ retry:
                goto retry;
 
        /*
-        * We took the lock due to owner died take over.
+        * We took the lock due to forced take over.
         */
        if (unlikely(lock_taken))
                return 1;
@@ -777,20 +897,25 @@ retry:
                switch (ret) {
                case -ESRCH:
                        /*
-                        * No owner found for this futex. Check if the
-                        * OWNER_DIED bit is set to figure out whether
-                        * this is a robust futex or not.
+                        * We failed to find an owner for this
+                        * futex. So we have no pi_state to block
+                        * on. This can happen in two cases:
+                        *
+                        * 1) The owner died
+                        * 2) A stale FUTEX_WAITERS bit
+                        *
+                        * Re-read the futex value.
                         */
                        if (get_futex_value_locked(&curval, uaddr))
                                return -EFAULT;
 
                        /*
-                        * We simply start over in case of a robust
-                        * futex. The code above will take the futex
-                        * and return happy.
+                        * If the owner died or we have a stale
+                        * WAITERS bit the owner TID in the user space
+                        * futex is 0.
                         */
-                       if (curval & FUTEX_OWNER_DIED) {
-                               ownerdied = 1;
+                       if (!(curval & FUTEX_TID_MASK)) {
+                               force_take = 1;
                                goto retry;
                        }
                default:
@@ -827,6 +952,9 @@ static void wake_futex(struct futex_q *q)
 {
        struct task_struct *p = q->task;
 
+       if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+               return;
+
        /*
         * We set q->lock_ptr = NULL _before_ we wake up the task. If
         * a non-futex wake up happens on another CPU then the task
@@ -855,6 +983,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        struct task_struct *new_owner;
        struct futex_pi_state *pi_state = this->pi_state;
        u32 uninitialized_var(curval), newval;
+       int ret = 0;
 
        if (!pi_state)
                return -EINVAL;
@@ -878,23 +1007,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
                new_owner = this->task;
 
        /*
-        * We pass it to the next owner. (The WAITERS bit is always
-        * kept enabled while there is PI state around. We must also
-        * preserve the owner died bit.)
+        * We pass it to the next owner. The WAITERS bit is always
+        * kept enabled while there is PI state around. We cleanup the
+        * owner died bit, because we are the owner.
         */
-       if (!(uval & FUTEX_OWNER_DIED)) {
-               int ret = 0;
-
-               newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+       newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-                       ret = -EFAULT;
-               else if (curval != uval)
-                       ret = -EINVAL;
-               if (ret) {
-                       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-                       return ret;
-               }
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+               ret = -EFAULT;
+       else if (curval != uval)
+               ret = -EINVAL;
+       if (ret) {
+               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+               return ret;
        }
 
        raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1062,6 +1187,10 @@ retry_private:
 
        plist_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key1)) {
+                       if (this->pi_state || this->rt_waiter) {
+                               ret = -EINVAL;
+                               goto out_unlock;
+                       }
                        wake_futex(this);
                        if (++ret >= nr_wake)
                                break;
@@ -1074,6 +1203,10 @@ retry_private:
                op_ret = 0;
                plist_for_each_entry_safe(this, next, head, list) {
                        if (match_futex (&this->key, &key2)) {
+                               if (this->pi_state || this->rt_waiter) {
+                                       ret = -EINVAL;
+                                       goto out_unlock;
+                               }
                                wake_futex(this);
                                if (++op_ret >= nr_wake2)
                                        break;
@@ -1082,6 +1215,7 @@ retry_private:
                ret += op_ret;
        }
 
+out_unlock:
        double_unlock_hb(hb1, hb2);
 out_put_keys:
        put_futex_key(&key2);
@@ -1164,7 +1298,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  *
  * Returns:
  *  0 - failed to acquire the lock atomicly
- *  1 - acquired the lock
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1175,7 +1309,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
-       int ret;
+       int ret, vpid;
 
        if (get_futex_value_locked(&curval, pifutex))
                return -EFAULT;
@@ -1203,11 +1337,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         * the contended case or if set_waiters is 1.  The pi_state is returned
         * in ps in contended cases.
         */
+       vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                   set_waiters);
-       if (ret == 1)
+       if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+               return vpid;
+       }
        return ret;
 }
 
@@ -1239,9 +1375,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head1;
        struct futex_q *this, *next;
-       u32 curval2;
 
        if (requeue_pi) {
+               /*
+                * Requeue PI only works on two distinct uaddrs. This
+                * check is only valid for private futexes. See below.
+                */
+               if (uaddr1 == uaddr2)
+                       return -EINVAL;
+
                /*
                 * requeue_pi requires a pi_state, try to allocate it now
                 * without any locks in case it fails.
@@ -1263,15 +1405,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        }
 
 retry:
-       if (pi_state != NULL) {
-               /*
-                * We will have to lookup the pi_state again, so free this one
-                * to keep the accounting correct.
-                */
-               free_pi_state(pi_state);
-               pi_state = NULL;
-       }
-
        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
@@ -1280,6 +1413,15 @@ retry:
        if (unlikely(ret != 0))
                goto out_put_key1;
 
+       /*
+        * The check above which compares uaddrs is not sufficient for
+        * shared futexes. We need to compare the keys:
+        */
+       if (requeue_pi && match_futex(&key1, &key2)) {
+               ret = -EINVAL;
+               goto out_put_keys;
+       }
+
        hb1 = hash_futex(&key1);
        hb2 = hash_futex(&key2);
 
@@ -1325,22 +1467,33 @@ retry_private:
                 * At this point the top_waiter has either taken uaddr2 or is
                 * waiting on it.  If the former, then the pi_state will not
                 * exist yet, look it up one more time to ensure we have a
-                * reference to it.
+                * reference to it. If the lock was taken, ret contains the
+                * vpid of the top waiter task.
                 */
-               if (ret == 1) {
+               if (ret > 0) {
                        WARN_ON(pi_state);
                        drop_count++;
                        task_count++;
-                       ret = get_futex_value_locked(&curval2, uaddr2);
-                       if (!ret)
-                               ret = lookup_pi_state(curval2, hb2, &key2,
-                                                     &pi_state);
+                       /*
+                        * If we acquired the lock, then the user
+                        * space value of uaddr2 should be vpid. It
+                        * cannot be changed by the top waiter as it
+                        * is blocked on hb2 lock if it tries to do
+                        * so. If something fiddled with it behind our
+                        * back the pi state lookup might unearth
+                        * it. So we rather use the known value than
+                        * rereading and handing potential crap to
+                        * lookup_pi_state.
+                        */
+                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
                }
 
                switch (ret) {
                case 0:
                        break;
                case -EFAULT:
+                       free_pi_state(pi_state);
+                       pi_state = NULL;
                        double_unlock_hb(hb1, hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
@@ -1350,6 +1503,8 @@ retry_private:
                        goto out;
                case -EAGAIN:
                        /* The owner was exiting, try again. */
+                       free_pi_state(pi_state);
+                       pi_state = NULL;
                        double_unlock_hb(hb1, hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
@@ -1371,9 +1526,13 @@ retry_private:
                /*
                 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
                 * be paired with each other and no other futex ops.
+                *
+                * We should never be requeueing a futex_q with a pi_state,
+                * which is awaiting a futex_unlock_pi().
                 */
                if ((requeue_pi && !this->rt_waiter) ||
-                   (!requeue_pi && this->rt_waiter)) {
+                   (!requeue_pi && this->rt_waiter) ||
+                   this->pi_state) {
                        ret = -EINVAL;
                        break;
                }
@@ -1422,6 +1581,7 @@ retry_private:
        }
 
 out_unlock:
+       free_pi_state(pi_state);
        double_unlock_hb(hb1, hb2);
 
        /*
@@ -1438,8 +1598,6 @@ out_put_keys:
 out_put_key1:
        put_futex_key(&key1);
 out:
-       if (pi_state != NULL)
-               free_pi_state(pi_state);
        return ret ? ret : task_count;
 }
 
@@ -2100,9 +2258,10 @@ retry:
        /*
         * To avoid races, try to do the TID -> 0 atomic transition
         * again. If it succeeds then we can return without waking
-        * anyone else up:
+        * anyone else up. We only try this if neither the waiters nor
+        * the owner died bit are set.
         */
-       if (!(uval & FUTEX_OWNER_DIED) &&
+       if (!(uval & ~FUTEX_TID_MASK) &&
            cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
                goto pi_faulted;
        /*
@@ -2134,11 +2293,9 @@ retry:
        /*
         * No waiters - kernel unlocks the futex:
         */
-       if (!(uval & FUTEX_OWNER_DIED)) {
-               ret = unlock_futex_pi(uaddr, uval);
-               if (ret == -EFAULT)
-                       goto pi_faulted;
-       }
+       ret = unlock_futex_pi(uaddr, uval);
+       if (ret == -EFAULT)
+               goto pi_faulted;
 
 out_unlock:
        spin_unlock(&hb->lock);
@@ -2218,11 +2375,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * @uaddr2:    the pi futex we will take prior to returning to user-space
  *
  * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
  *
  * We call schedule in futex_wait_queue_me() when we enqueue and return there
  * via the following:
@@ -2253,12 +2410,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct rt_mutex_waiter rt_waiter;
-       struct rt_mutex *pi_mutex = NULL;
        struct futex_hash_bucket *hb;
        union futex_key key2 = FUTEX_KEY_INIT;
        struct futex_q q = futex_q_init;
        int res, ret;
 
+       if (uaddr == uaddr2)
+               return -EINVAL;
+
        if (!bitset)
                return -EINVAL;
 
@@ -2295,6 +2454,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        if (ret)
                goto out_key2;
 
+       /*
+        * The check above which compares uaddrs is not sufficient for
+        * shared futexes. We need to compare the keys:
+        */
+       if (match_futex(&q.key, &key2)) {
+               queue_unlock(&q, hb);
+               ret = -EINVAL;
+               goto out_put_keys;
+       }
+
        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
        futex_wait_queue_me(hb, &q, to);
 
@@ -2322,15 +2491,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
                        ret = fixup_pi_state_owner(uaddr2, &q, current);
+                       /*
+                        * Drop the reference to the pi state which
+                        * the requeue_pi() code acquired for us.
+                        */
+                       free_pi_state(q.pi_state);
                        spin_unlock(q.lock_ptr);
                }
        } else {
+               struct rt_mutex *pi_mutex;
+
                /*
                 * We have been woken up by futex_unlock_pi(), a timeout, or a
                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
                 * the pi_state.
                 */
-               WARN_ON(!&q.pi_state);
+               WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
                debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2348,18 +2524,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (res)
                        ret = (res < 0) ? res : 0;
 
+               /*
+                * If fixup_pi_state_owner() faulted and was unable to handle
+                * the fault, unlock the rt_mutex and return the fault to
+                * userspace.
+                */
+               if (ret && rt_mutex_owner(pi_mutex) == current)
+                       rt_mutex_unlock(pi_mutex);
+
                /* Unqueue and drop the lock. */
                unqueue_me_pi(&q);
        }
 
-       /*
-        * If fixup_pi_state_owner() faulted and was unable to handle the
-        * fault, unlock the rt_mutex and return the fault to userspace.
-        */
-       if (ret == -EFAULT) {
-               if (rt_mutex_owner(pi_mutex) == current)
-                       rt_mutex_unlock(pi_mutex);
-       } else if (ret == -EINTR) {
+       if (ret == -EINTR) {
                /*
                 * We've already been requeued, but cannot restart by calling
                 * futex_lock_pi() directly. We could restart this syscall, but
@@ -2431,40 +2608,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 {
        struct robust_list_head __user *head;
        unsigned long ret;
-       const struct cred *cred = current_cred(), *pcred;
+       struct task_struct *p;
 
        if (!futex_cmpxchg_enabled)
                return -ENOSYS;
 
+       rcu_read_lock();
+
+       ret = -ESRCH;
        if (!pid)
-               head = current->robust_list;
+               p = current;
        else {
-               struct task_struct *p;
-
-               ret = -ESRCH;
-               rcu_read_lock();
                p = find_task_by_vpid(pid);
                if (!p)
                        goto err_unlock;
-               ret = -EPERM;
-               pcred = __task_cred(p);
-               /* If victim is in different user_ns, then uids are not
-                  comparable, so we must have CAP_SYS_PTRACE */
-               if (cred->user->user_ns != pcred->user->user_ns) {
-                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-                               goto err_unlock;
-                       goto ok;
-               }
-               /* If victim is in same user_ns, then uids are comparable */
-               if (cred->euid != pcred->euid &&
-                   cred->euid != pcred->uid &&
-                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-                       goto err_unlock;
-ok:
-               head = p->robust_list;
-               rcu_read_unlock();
        }
 
+       ret = -EPERM;
+       if (!ptrace_may_access(p, PTRACE_MODE_READ))
+               goto err_unlock;
+
+       head = p->robust_list;
+       rcu_read_unlock();
+
        if (put_user(sizeof(*head), len_ptr))
                return -EFAULT;
        return put_user(head, head_ptr);
@@ -2628,6 +2794,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                        return -ENOSYS;
        }
 
+       switch (cmd) {
+       case FUTEX_LOCK_PI:
+       case FUTEX_UNLOCK_PI:
+       case FUTEX_TRYLOCK_PI:
+       case FUTEX_WAIT_REQUEUE_PI:
+       case FUTEX_CMP_REQUEUE_PI:
+               if (!futex_cmpxchg_enabled)
+                       return -ENOSYS;
+       }
+
        switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
@@ -2649,16 +2825,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
                break;
        case FUTEX_LOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
+               ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
                break;
        case FUTEX_UNLOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_unlock_pi(uaddr, flags);
+               ret = futex_unlock_pi(uaddr, flags);
                break;
        case FUTEX_TRYLOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
+               ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
                break;
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
@@ -2733,4 +2906,4 @@ static int __init futex_init(void)
 
        return 0;
 }
-__initcall(futex_init);
+core_initcall(futex_init);