cgroup: notify_on_release may not be triggered in some cases
[linux-2.6.git] / kernel / futex.c
index c6bef6e..3717e7b 100644 (file)
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/magic.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/ptrace.h>
 
 #include <asm/futex.h>
 
@@ -218,6 +219,8 @@ static void drop_futex_key_refs(union futex_key *key)
  * @uaddr:     virtual address of the futex
  * @fshared:   0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
  * @key:       address where result is stored.
+ * @rw:                mapping needs to be read/write (values: VERIFY_READ,
+ *              VERIFY_WRITE)
  *
  * Returns a negative error code or 0
  * The key words are stored in *key on success.
@@ -229,12 +232,12 @@ static void drop_futex_key_refs(union futex_key *key)
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
        struct page *page, *page_head;
-       int err;
+       int err, ro = 0;
 
        /*
         * The futex address must be "naturally" aligned.
@@ -262,8 +265,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 
 again:
        err = get_user_pages_fast(address, 1, 1, &page);
+       /*
+        * If write access is not required (eg. FUTEX_WAIT), try
+        * and get read-only access.
+        */
+       if (err == -EFAULT && rw == VERIFY_READ) {
+               err = get_user_pages_fast(address, 1, 0, &page);
+               ro = 1;
+       }
        if (err < 0)
                return err;
+       else
+               err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        page_head = page;
@@ -302,10 +315,29 @@ again:
 #endif
 
        lock_page(page_head);
+
+       /*
+        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * page; but it might be the ZERO_PAGE or in the gate area or
+        * in a special mapping (all cases which we are happy to fail);
+        * or it may have been a good file page when get_user_pages_fast
+        * found it, but truncated or holepunched or subjected to
+        * invalidate_complete_page2 before we got the page lock (also
+        * cases which we are happy to fail).  And we hold a reference,
+        * so refcount care in invalidate_complete_page's remove_mapping
+        * prevents drop_caches from setting mapping to NULL beneath us.
+        *
+        * The case we do have to guard against is when memory pressure made
+        * shmem_writepage move it from filecache to swapcache beneath us:
+        * an unlikely race, but we do need to retry for page_head->mapping.
+        */
        if (!page_head->mapping) {
+               int shmem_swizzled = PageSwapCache(page_head);
                unlock_page(page_head);
                put_page(page_head);
-               goto again;
+               if (shmem_swizzled)
+                       goto again;
+               return -EFAULT;
        }
 
        /*
@@ -316,6 +348,15 @@ again:
         * the object not the particular process.
         */
        if (PageAnon(page_head)) {
+               /*
+                * A RO anonymous page will never change and thus doesn't make
+                * sense for futex operations.
+                */
+               if (ro) {
+                       err = -EFAULT;
+                       goto out;
+               }
+
                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
@@ -327,9 +368,10 @@ again:
 
        get_futex_key_refs(key);
 
+out:
        unlock_page(page_head);
        put_page(page_head);
-       return 0;
+       return err;
 }
 
 static inline void put_futex_key(union futex_key *key)
@@ -355,8 +397,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
        int ret;
 
        down_read(&mm->mmap_sem);
-       ret = get_user_pages(current, mm, (unsigned long)uaddr,
-                            1, 1, 0, NULL, NULL);
+       ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+                              FAULT_FLAG_WRITE);
        up_read(&mm->mmap_sem);
 
        return ret < 0 ? ret : 0;
@@ -782,8 +824,8 @@ static void __unqueue_futex(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
 
-       if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
-                       || plist_node_empty(&q->list)))
+       if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+           || WARN_ON(plist_node_empty(&q->list)))
                return;
 
        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
@@ -825,7 +867,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 {
        struct task_struct *new_owner;
        struct futex_pi_state *pi_state = this->pi_state;
-       u32 curval, newval;
+       u32 uninitialized_var(curval), newval;
 
        if (!pi_state)
                return -EINVAL;
@@ -887,7 +929,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 
 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 {
-       u32 oldval;
+       u32 uninitialized_var(oldval);
 
        /*
         * There is no waiter, so we unlock the futex. The owner died
@@ -940,7 +982,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
        if (!bitset)
                return -EINVAL;
 
-       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
 
@@ -986,10 +1028,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
        int ret, op_ret;
 
 retry:
-       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1243,10 +1285,11 @@ retry:
                pi_state = NULL;
        }
 
-       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+                           requeue_pi ? VERIFY_WRITE : VERIFY_READ);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1546,7 +1589,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
        struct task_struct *oldowner = pi_state->owner;
-       u32 uval, curval, newval;
+       u32 uval, uninitialized_var(curval), newval;
        int ret;
 
        /* Owner died? */
@@ -1555,10 +1598,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
        /*
         * We are here either because we stole the rtmutex from the
-        * pending owner or we are the pending owner which failed to
-        * get the rtmutex. We have to replace the pending owner TID
-        * in the user space variable. This must be atomic as we have
-        * to preserve the owner died bit here.
+        * previous highest priority waiter or we are the highest priority
+        * waiter but failed to get the rtmutex the first time.
+        * We have to replace the newowner TID in the user space variable.
+        * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
         * because we can fault here. Imagine swapped out pages or a fork
@@ -1605,8 +1648,8 @@ retry:
 
        /*
         * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the pending
-        * owner itself or the task which stole the rtmutex) the
+        * lock here. That gives the other task (either the highest priority
+        * waiter itself or the task which stole the rtmutex) the
         * chance to try the fixup of the pi_state. So once we are
         * back from handling the fault we need to check the pi_state
         * after reacquiring the hash bucket lock and before trying to
@@ -1682,18 +1725,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                /*
                 * pi_state is incorrect, some other task did a lock steal and
                 * we returned due to timeout or signal without taking the
-                * rt_mutex. Too late. We can access the rt_mutex_owner without
-                * locking, as the other task is now blocked on the hash bucket
-                * lock. Fix the state up.
+                * rt_mutex. Too late.
                 */
+               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+               if (!owner)
+                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
                ret = fixup_pi_state_owner(uaddr, q, owner);
                goto out;
        }
 
        /*
         * Paranoia check. If we did not take the lock, then we should not be
-        * the owner, nor the pending owner, of the rt_mutex.
+        * the owner of the rt_mutex.
         */
        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1761,7 +1806,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  *
  * Returns:
  *  0 - uaddr contains val and hb has been locked
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
  */
 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                           struct futex_q *q, struct futex_hash_bucket **hb)
@@ -1788,7 +1833,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
         * while the syscall executes.
         */
 retry:
-       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
        if (unlikely(ret != 0))
                return ret;
 
@@ -1884,7 +1929,7 @@ retry:
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
-       restart->futex.flags = flags;
+       restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
        ret = -ERESTART_RESTARTBLOCK;
 
@@ -1939,7 +1984,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
        }
 
 retry:
-       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2058,7 +2103,7 @@ retry:
        if ((uval & FUTEX_TID_MASK) != vpid)
                return -EPERM;
 
-       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2186,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * @uaddr2:    the pi futex we will take prior to returning to user-space
  *
  * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
  *
  * We call schedule in futex_wait_queue_me() when we enqueue and return there
  * via the following:
@@ -2227,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        struct futex_q q = futex_q_init;
        int res, ret;
 
+       if (uaddr == uaddr2)
+               return -EINVAL;
+
        if (!bitset)
                return -EINVAL;
 
@@ -2247,7 +2295,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        debug_rt_mutex_init_waiter(&rt_waiter);
        rt_waiter.task = NULL;
 
-       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2298,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
                 * the pi_state.
                 */
-               WARN_ON(!&q.pi_state);
+               WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
                debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2325,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * fault, unlock the rt_mutex and return the fault to userspace.
         */
        if (ret == -EFAULT) {
-               if (rt_mutex_owner(pi_mutex) == current)
+               if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
                        rt_mutex_unlock(pi_mutex);
        } else if (ret == -EINTR) {
                /*
@@ -2399,31 +2447,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 {
        struct robust_list_head __user *head;
        unsigned long ret;
-       const struct cred *cred = current_cred(), *pcred;
+       struct task_struct *p;
 
        if (!futex_cmpxchg_enabled)
                return -ENOSYS;
 
+       WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
+
+       rcu_read_lock();
+
+       ret = -ESRCH;
        if (!pid)
-               head = current->robust_list;
+               p = current;
        else {
-               struct task_struct *p;
-
-               ret = -ESRCH;
-               rcu_read_lock();
                p = find_task_by_vpid(pid);
                if (!p)
                        goto err_unlock;
-               ret = -EPERM;
-               pcred = __task_cred(p);
-               if (cred->euid != pcred->euid &&
-                   cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
-                       goto err_unlock;
-               head = p->robust_list;
-               rcu_read_unlock();
        }
 
+       ret = -EPERM;
+       if (!ptrace_may_access(p, PTRACE_MODE_READ))
+               goto err_unlock;
+
+       head = p->robust_list;
+       rcu_read_unlock();
+
        if (put_user(sizeof(*head), len_ptr))
                return -EFAULT;
        return put_user(head, head_ptr);
@@ -2440,7 +2488,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
-       u32 uval, nval, mval;
+       u32 uval, uninitialized_var(nval), mval;
 
 retry:
        if (get_user(uval, uaddr))
@@ -2458,9 +2506,20 @@ retry:
                 * userspace.
                 */
                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-               if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval))
-                       return -1;
-
+               /*
+                * We are not holding a lock here, but we want to have
+                * the pagefault_disable/enable() protection because
+                * we want to handle the fault gracefully. If the
+                * access fails we try to fault in the futex with R/W
+                * verification via get_user_pages. get_user() above
+                * does not guarantee R/W access. If that fails we
+                * give up and leave the futex locked.
+                */
+               if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+                       if (fault_in_user_writeable(uaddr))
+                               return -1;
+                       goto retry;
+               }
                if (nval != uval)
                        goto retry;
 
@@ -2564,7 +2623,7 @@ void exit_robust_list(struct task_struct *curr)
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
 {
-       int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+       int cmd = op & FUTEX_CMD_MASK;
        unsigned int flags = 0;
 
        if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2577,49 +2636,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        }
 
        switch (cmd) {
+       case FUTEX_LOCK_PI:
+       case FUTEX_UNLOCK_PI:
+       case FUTEX_TRYLOCK_PI:
+       case FUTEX_WAIT_REQUEUE_PI:
+       case FUTEX_CMP_REQUEUE_PI:
+               if (!futex_cmpxchg_enabled)
+                       return -ENOSYS;
+       }
+
+       switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAIT_BITSET:
-               ret = futex_wait(uaddr, flags, val, timeout, val3);
-               break;
+               return futex_wait(uaddr, flags, val, timeout, val3);
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAKE_BITSET:
-               ret = futex_wake(uaddr, flags, val, val3);
-               break;
+               return futex_wake(uaddr, flags, val, val3);
        case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
-               break;
+               return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
        case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
-               break;
+               return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
        case FUTEX_WAKE_OP:
-               ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
-               break;
+               return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
        case FUTEX_LOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
-               break;
+               return futex_lock_pi(uaddr, flags, val, timeout, 0);
        case FUTEX_UNLOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_unlock_pi(uaddr, flags);
-               break;
+               return futex_unlock_pi(uaddr, flags);
        case FUTEX_TRYLOCK_PI:
-               if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
-               break;
+               return futex_lock_pi(uaddr, flags, 0, timeout, 1);
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
-               ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
-                                           uaddr2);
-               break;
+               return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+                                            uaddr2);
        case FUTEX_CMP_REQUEUE_PI:
-               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
-               break;
-       default:
-               ret = -ENOSYS;
+               return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
        }
-       return ret;
+       return -ENOSYS;
 }
 
 
@@ -2675,7 +2729,7 @@ static int __init futex_init(void)
                futex_cmpxchg_enabled = 1;
 
        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
-               plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+               plist_head_init(&futex_queues[i].chain);
                spin_lock_init(&futex_queues[i].lock);
        }