exofs: Move exofs specific osd operations out of ios.c
[linux-2.6.git] / fs / namei.c
index a343163..14ab8d3 100644 (file)
@@ -70,7 +70,7 @@
  * name indicated by the symlink. The old code always complained that the
  * name already exists, due to not following the symlink even if its target
  * is nonexistent.  The new semantics affects also mknod() and link() when
- * the name is a symlink pointing to a non-existant name.
+ * the name is a symlink pointing to a non-existent name.
  *
  * I don't know which semantics is the right one, since I have no access
  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -179,10 +179,13 @@ EXPORT_SYMBOL(putname);
 static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
                int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
-       umode_t                 mode = inode->i_mode;
+       unsigned int mode = inode->i_mode;
 
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
+       if (current_user_ns() != inode_userns(inode))
+               goto other_perms;
+
        if (current_fsuid() == inode->i_uid)
                mode >>= 6;
        else {
@@ -196,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
                        mode >>= 3;
        }
 
+other_perms:
        /*
         * If the DACs are ok we don't need any capability check.
         */
@@ -234,10 +238,11 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
 
        /*
         * Read/write DACs are always overridable.
-        * Executable DACs are overridable if at least one exec bit is set.
+        * Executable DACs are overridable for all directories and
+        * for non-directories that have least one exec bit set.
         */
        if (!(mask & MAY_EXEC) || execute_ok(inode))
-               if (capable(CAP_DAC_OVERRIDE))
+               if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
                        return 0;
 
        /*
@@ -245,7 +250,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
         */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
-               if (capable(CAP_DAC_READ_SEARCH))
+               if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
                        return 0;
 
        return -EACCES;
@@ -387,79 +392,28 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
  */
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
-       struct fs_struct *fs = current->fs;
-       struct dentry *dentry = nd->path.dentry;
-       int want_root = 0;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-               want_root = 1;
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt ||
-                               nd->root.dentry != fs->root.dentry)
-                       goto err_root;
-       }
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-       if (want_root) {
-               path_get(&nd->root);
-               spin_unlock(&fs->lock);
-       }
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       nd->flags &= ~LOOKUP_RCU;
-       return 0;
-err:
-       spin_unlock(&dentry->d_lock);
-err_root:
-       if (want_root)
-               spin_unlock(&fs->lock);
-       return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
-       if (nd->flags & LOOKUP_RCU)
-               return nameidata_drop_rcu(nd);
-       return 0;
-}
 
 /**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
  * Returns: 0 on success, -ECHILD on failure
  *
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
  */
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
@@ -474,18 +428,27 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
                        goto err_root;
        }
        spin_lock(&parent->d_lock);
-       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       /*
-        * If the sequence check on the child dentry passed, then the child has
-        * not been removed from its parent. This means the parent dentry must
-        * be valid and able to take a reference at this point.
-        */
-       BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-       BUG_ON(!parent->d_count);
-       parent->d_count++;
-       spin_unlock(&dentry->d_lock);
+       if (!dentry) {
+               if (!__d_rcu_to_refcount(parent, nd->seq))
+                       goto err_parent;
+               BUG_ON(nd->inode != parent->d_inode);
+       } else {
+               if (dentry->d_parent != parent)
+                       goto err_parent;
+               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+               if (!__d_rcu_to_refcount(dentry, nd->seq))
+                       goto err_child;
+               /*
+                * If the sequence check on the child dentry passed, then
+                * the child has not been removed from its parent. This
+                * means the parent dentry must be valid and able to take
+                * a reference at this point.
+                */
+               BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+               BUG_ON(!parent->d_count);
+               parent->d_count++;
+               spin_unlock(&dentry->d_lock);
+       }
        spin_unlock(&parent->d_lock);
        if (want_root) {
                path_get(&nd->root);
@@ -497,8 +460,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
        br_read_unlock(vfsmount_lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
-err:
+
+err_child:
        spin_unlock(&dentry->d_lock);
+err_parent:
        spin_unlock(&parent->d_lock);
 err_root:
        if (want_root)
@@ -506,59 +471,6 @@ err_root:
        return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
-       if (nd->flags & LOOKUP_RCU) {
-               if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
-                       nd->flags &= ~LOOKUP_RCU;
-                       if (!(nd->flags & LOOKUP_ROOT))
-                               nd->root.mnt = NULL;
-                       rcu_read_unlock();
-                       br_read_unlock(vfsmount_lock);
-                       return -ECHILD;
-               }
-       }
-       return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
-       struct dentry *dentry = nd->path.dentry;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       nd->flags &= ~LOOKUP_RCU;
-       if (!(nd->flags & LOOKUP_ROOT))
-               nd->root.mnt = NULL;
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err_unlock;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-
-       return 0;
-
-err_unlock:
-       spin_unlock(&dentry->d_lock);
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       return -ECHILD;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -602,26 +514,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
        return dentry;
 }
 
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
  *
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
  */
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
 {
        struct dentry *dentry = nd->path.dentry;
        int status;
 
+       if (nd->flags & LOOKUP_RCU) {
+               nd->flags &= ~LOOKUP_RCU;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               spin_lock(&dentry->d_lock);
+               if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+                       spin_unlock(&dentry->d_lock);
+                       rcu_read_unlock();
+                       br_read_unlock(vfsmount_lock);
+                       return -ECHILD;
+               }
+               BUG_ON(nd->inode != dentry->d_inode);
+               spin_unlock(&dentry->d_lock);
+               mntget(nd->path.mnt);
+               rcu_read_unlock();
+               br_read_unlock(vfsmount_lock);
+       }
+
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
                return 0;
 
@@ -639,6 +564,7 @@ static inline int handle_reval_path(struct nameidata *nd)
        if (!status)
                status = -ESTALE;
 
+       path_put(&nd->path);
        return status;
 }
 
@@ -654,6 +580,7 @@ static inline int handle_reval_path(struct nameidata *nd)
 static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
        int ret;
+       struct user_namespace *ns = inode_userns(inode);
 
        if (inode->i_op->permission) {
                ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -666,7 +593,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
        if (ret == -ECHILD)
                return ret;
 
-       if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
+       if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
+                       ns_capable(ns, CAP_DAC_READ_SEARCH))
                goto ok;
 
        return ret;
@@ -691,6 +619,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
+                       nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        }
 }
@@ -737,20 +666,36 @@ static inline void path_to_nameidata(const struct path *path,
        nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+       struct inode *inode = link->dentry->d_inode;
+       if (!IS_ERR(cookie) && inode->i_op->put_link)
+               inode->i_op->put_link(link->dentry, nd, cookie);
+       path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
        int error;
        struct dentry *dentry = link->dentry;
 
        BUG_ON(nd->flags & LOOKUP_RCU);
 
-       touch_atime(link->mnt, dentry);
-       nd_set_link(nd, NULL);
-
        if (link->mnt == nd->path.mnt)
                mntget(link->mnt);
 
+       if (unlikely(current->total_link_count >= 40)) {
+               *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       cond_resched();
+       current->total_link_count++;
+
+       touch_atime(link->mnt, dentry);
+       nd_set_link(nd, NULL);
+
        error = security_inode_follow_link(link->dentry, nd);
        if (error) {
                *p = ERR_PTR(error); /* no ->put_link(), please */
@@ -779,40 +724,6 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
        return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
-{
-       void *cookie;
-       int err = -ELOOP;
-
-       if (current->link_count >= MAX_NESTED_LINKS)
-               goto loop;
-       if (current->total_link_count >= 40)
-               goto loop;
-       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-       cond_resched();
-       current->link_count++;
-       current->total_link_count++;
-       nd->depth++;
-       err = __do_follow_link(path, nd, &cookie);
-       if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-               path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-       path_put(path);
-       current->link_count--;
-       nd->depth--;
-       return err;
-loop:
-       path_put_conditional(path, nd);
-       path_put(&nd->path);
-       return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
        struct vfsmount *parent;
@@ -904,6 +815,11 @@ static int follow_automount(struct path *path, unsigned flags,
        if (!mnt) /* mount collision */
                return 0;
 
+       if (!*need_mntput) {
+               /* lock_mount() may release path->mnt on error */
+               mntget(path->mnt);
+               *need_mntput = true;
+       }
        err = finish_automount(mnt, path);
 
        switch (err) {
@@ -911,12 +827,9 @@ static int follow_automount(struct path *path, unsigned flags,
                /* Someone else made a mount here whilst we were busy */
                return 0;
        case 0:
-               dput(path->dentry);
-               if (*need_mntput)
-                       mntput(path->mnt);
+               path_put(path);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
-               *need_mntput = true;
                return 0;
        default:
                return err;
@@ -936,9 +849,10 @@ static int follow_automount(struct path *path, unsigned flags,
  */
 static int follow_managed(struct path *path, unsigned flags)
 {
+       struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
        unsigned managed;
        bool need_mntput = false;
-       int ret;
+       int ret = 0;
 
        /* Given that we're not holding a lock here, we retain the value in a
         * local variable for each dentry as we look at it so that we don't see
@@ -951,10 +865,9 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(path->dentry,
-                                                          false, false);
+                       ret = path->dentry->d_op->d_manage(path->dentry, false);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                }
 
                /* Transit to a mounted filesystem. */
@@ -980,14 +893,19 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_NEED_AUTOMOUNT) {
                        ret = follow_automount(path, flags, &need_mntput);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                        continue;
                }
 
                /* We didn't change the current path point */
                break;
        }
-       return 0;
+
+       if (need_mntput && path->mnt == mnt)
+               mntput(path->mnt);
+       if (ret == -EISDIR)
+               ret = 0;
+       return ret;
 }
 
 int follow_down_one(struct path *path)
@@ -1005,38 +923,62 @@ int follow_down_one(struct path *path)
        return 0;
 }
 
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+               dentry->d_op->d_manage(dentry, true) < 0);
+}
+
 /*
- * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..".  True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+ * we meet a managed dentry that would need blocking.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-                              struct inode **inode, bool reverse_transit)
+                              struct inode **inode)
 {
-       while (d_mountpoint(path->dentry)) {
+       for (;;) {
                struct vfsmount *mounted;
-               if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-                   !reverse_transit &&
-                   path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+               /*
+                * Don't forget we might have a non-mountpoint managed dentry
+                * that wants to block transit.
+                */
+               if (unlikely(managed_dentry_might_block(path->dentry)))
                        return false;
+
+               if (!d_mountpoint(path->dentry))
+                       break;
+
                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                if (!mounted)
                        break;
                path->mnt = mounted;
                path->dentry = mounted->mnt_root;
                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+               /*
+                * Update the inode too. We don't need to re-check the
+                * dentry sequence number here after this d_inode read,
+                * because a mount-point is always pinned.
+                */
                *inode = path->dentry->d_inode;
        }
-
-       if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-               return reverse_transit;
        return true;
 }
 
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
 {
-       struct inode *inode = nd->inode;
+       while (d_mountpoint(nd->path.dentry)) {
+               struct vfsmount *mounted;
+               mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+               if (!mounted)
+                       break;
+               nd->path.mnt = mounted;
+               nd->path.dentry = mounted->mnt_root;
+               nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+       }
+}
 
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
        set_root_rcu(nd);
 
        while (1) {
@@ -1052,7 +994,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
                                goto failed;
-                       inode = parent->d_inode;
                        nd->path.dentry = parent;
                        nd->seq = seq;
                        break;
@@ -1060,10 +1001,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                if (!follow_up_rcu(&nd->path))
                        break;
                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-               inode = nd->path.dentry->d_inode;
        }
-       __follow_mount_rcu(nd, &nd->path, &inode, true);
-       nd->inode = inode;
+       follow_mount_rcu(nd);
+       nd->inode = nd->path.dentry->d_inode;
        return 0;
 
 failed:
@@ -1079,11 +1019,8 @@ failed:
  * Follow down to the covering mount currently visible to userspace.  At each
  * point, the filesystem owning that dentry may be queried as to whether the
  * caller is permitted to proceed or not.
- *
- * Care must be taken as namespace_sem may be held (indicated by mounting_here
- * being true).
  */
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
 {
        unsigned managed;
        int ret;
@@ -1104,7 +1041,7 @@ int follow_down(struct path *path, bool mounting_here)
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(
-                               path->dentry, mounting_here, false);
+                               path->dentry, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }
@@ -1237,16 +1174,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                }
                path->mnt = mnt;
                path->dentry = dentry;
-               if (likely(__follow_mount_rcu(nd, path, inode, false)))
-                       return 0;
+               if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+                       goto unlazy;
+               if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                       goto unlazy;
+               return 0;
 unlazy:
-               if (dentry) {
-                       if (nameidata_dentry_drop_rcu(nd, dentry))
-                               return -ECHILD;
-               } else {
-                       if (nameidata_drop_rcu(nd))
-                               return -ECHILD;
-               }
+               if (unlazy_walk(nd, dentry))
+                       return -ECHILD;
        } else {
                dentry = __d_lookup(parent, name);
        }
@@ -1302,7 +1237,7 @@ static inline int may_lookup(struct nameidata *nd)
                int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
                if (err != -ECHILD)
                        return err;
-               if (nameidata_drop_rcu(nd))
+               if (unlazy_walk(nd, NULL))
                        return -ECHILD;
        }
        return exec_permission(nd->inode, 0);
@@ -1356,8 +1291,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
                return -ENOENT;
        }
        if (unlikely(inode->i_op->follow_link) && follow) {
-               if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-                       return -ECHILD;
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, path->dentry))) {
+                               terminate_walk(nd);
+                               return -ECHILD;
+                       }
+               }
                BUG_ON(inode != path->dentry->d_inode);
                return 1;
        }
@@ -1367,6 +1306,43 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 }
 
 /*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+       int res;
+
+       if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+               path_put_conditional(path, nd);
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+
+       nd->depth++;
+       current->link_count++;
+
+       do {
+               struct path link = *path;
+               void *cookie;
+
+               res = follow_link(&link, nd, &cookie);
+               if (!res)
+                       res = walk_component(nd, path, &nd->last,
+                                            nd->last_type, LOOKUP_FOLLOW);
+               put_link(nd, &link, cookie);
+       } while (res > 0);
+
+       current->link_count--;
+       nd->depth--;
+       return res;
+}
+
+/*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
  * the final dentry. We expect 'base' to be positive and a directory.
@@ -1385,9 +1361,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        if (!*name)
                return 0;
 
-       if (nd->depth)
-               lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
        /* At this point we know we have a real path component. */
        for(;;) {
                unsigned long hash;
@@ -1440,14 +1413,14 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                        goto last_component;
                while (*++name == '/');
                if (!*name)
-                       goto last_with_slashes;
+                       goto last_component;
 
                err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
                if (err < 0)
                        return err;
 
                if (err) {
-                       err = do_follow_link(&next, nd);
+                       err = nested_symlink(&next, nd);
                        if (err)
                                return err;
                }
@@ -1457,30 +1430,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                continue;
                /* here ends the main loop */
 
-last_with_slashes:
-               lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
                /* Clear LOOKUP_CONTINUE iff it was previously unset */
                nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-               if (lookup_flags & LOOKUP_PARENT) {
-                       nd->last = this;
-                       nd->last_type = type;
-                       return 0;
-               }
-               err = walk_component(nd, &next, &this, type,
-                                       lookup_flags & LOOKUP_FOLLOW);
-               if (err < 0)
-                       return err;
-               if (err) {
-                       err = do_follow_link(&next, nd);
-                       if (err)
-                               return err;
-               }
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
-                       if (!nd->inode->i_op->lookup)
-                               break;
-               }
+               nd->last = this;
+               nd->last_type = type;
                return 0;
        }
        terminate_walk(nd);
@@ -1629,44 +1583,25 @@ static int path_lookupat(int dfd, const char *name,
        err = link_path_walk(name, nd);
 
        if (!err && !(flags & LOOKUP_PARENT)) {
-               int count = 0;
                err = lookup_last(nd, &path);
                while (err > 0) {
                        void *cookie;
                        struct path link = path;
-                       struct inode *inode = link.dentry->d_inode;
-
-                       if (count++ > 32) {
-                               path_put_conditional(&path, nd);
-                               path_put(&nd->path);
-                               err = -ELOOP;
-                               break;
-                       }
-                       cond_resched();
                        nd->flags |= LOOKUP_PARENT;
-                       err = __do_follow_link(&link, nd, &cookie);
+                       err = follow_link(&link, nd, &cookie);
                        if (!err)
                                err = lookup_last(nd, &path);
-                       if (!IS_ERR(cookie) && inode->i_op->put_link)
-                               inode->i_op->put_link(link.dentry, nd, cookie);
-                       path_put(&link);
+                       put_link(nd, &link, cookie);
                }
        }
 
-       if (nd->flags & LOOKUP_RCU) {
-               /* went all way through without dropping RCU */
-               BUG_ON(err);
-               if (nameidata_drop_rcu_last(nd))
-                       err = -ECHILD;
-       }
-
        if (!err)
-               err = handle_reval_path(nd);
+               err = complete_walk(nd);
 
        if (!err && nd->flags & LOOKUP_DIRECTORY) {
                if (!nd->inode->i_op->lookup) {
                        path_put(&nd->path);
-                       return -ENOTDIR;
+                       err = -ENOTDIR;
                }
        }
 
@@ -1858,11 +1793,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
 
        if (!(dir->i_mode & S_ISVTX))
                return 0;
+       if (current_user_ns() != inode_userns(inode))
+               goto other_userns;
        if (inode->i_uid == fsuid)
                return 0;
        if (dir->i_uid == fsuid)
                return 0;
-       return !capable(CAP_FOWNER);
+
+other_userns:
+       return !ns_capable(inode_userns(inode), CAP_FOWNER);
 }
 
 /*
@@ -2042,7 +1981,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
        }
 
        /* O_NOATIME can only be set by the owner or superuser */
-       if (flag & O_NOATIME && !is_owner_or_cap(inode))
+       if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                return -EPERM;
 
        /*
@@ -2123,13 +2062,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                        return ERR_PTR(error);
                /* fallthrough */
        case LAST_ROOT:
-               if (nd->flags & LOOKUP_RCU) {
-                       if (nameidata_drop_rcu_last(nd))
-                               return ERR_PTR(-ECHILD);
-               }
-               error = handle_reval_path(nd);
+               error = complete_walk(nd);
                if (error)
-                       goto exit;
+                       return ERR_PTR(error);
                audit_inode(pathname, nd->path.dentry);
                if (open_flag & O_CREAT) {
                        error = -EISDIR;
@@ -2137,10 +2072,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                }
                goto ok;
        case LAST_BIND:
-               /* can't be RCU mode here */
-               error = handle_reval_path(nd);
+               error = complete_walk(nd);
                if (error)
-                       goto exit;
+                       return ERR_PTR(error);
                audit_inode(pathname, dir);
                goto ok;
        }
@@ -2159,10 +2093,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                if (error) /* symlink */
                        return NULL;
                /* sayonara */
-               if (nd->flags & LOOKUP_RCU) {
-                       if (nameidata_drop_rcu_last(nd))
-                               return ERR_PTR(-ECHILD);
-               }
+               error = complete_walk(nd);
+               if (error)
+                       return ERR_PTR(-ECHILD);
 
                error = -ENOTDIR;
                if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2174,11 +2107,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        }
 
        /* create side of things */
-
-       if (nd->flags & LOOKUP_RCU) {
-               if (nameidata_drop_rcu_last(nd))
-                       return ERR_PTR(-ECHILD);
-       }
+       error = complete_walk(nd);
+       if (error)
+               return ERR_PTR(error);
 
        audit_inode(pathname, dir);
        error = -EISDIR;
@@ -2308,7 +2239,6 @@ static struct file *path_openat(int dfd, const char *pathname,
        struct file *base = NULL;
        struct file *filp;
        struct path path;
-       int count = 0;
        int error;
 
        filp = get_empty_filp();
@@ -2332,35 +2262,21 @@ static struct file *path_openat(int dfd, const char *pathname,
        filp = do_last(nd, &path, op, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
                struct path link = path;
-               struct inode *linki = link.dentry->d_inode;
                void *cookie;
-               if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+               if (!(nd->flags & LOOKUP_FOLLOW)) {
                        path_put_conditional(&path, nd);
                        path_put(&nd->path);
                        filp = ERR_PTR(-ELOOP);
                        break;
                }
-               /*
-                * This is subtle. Instead of calling do_follow_link() we do
-                * the thing by hands. The reason is that this way we have zero
-                * link_count and path_walk() (called from ->follow_link)
-                * honoring LOOKUP_PARENT.  After that we have the parent and
-                * last component, i.e. we are in the same situation as after
-                * the first path_walk().  Well, almost - if the last component
-                * is normal we get its copy stored in nd->last.name and we will
-                * have to putname() it when we are done. Procfs-like symlinks
-                * just set LAST_BIND.
-                */
                nd->flags |= LOOKUP_PARENT;
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-               error = __do_follow_link(&link, nd, &cookie);
+               error = follow_link(&link, nd, &cookie);
                if (unlikely(error))
                        filp = ERR_PTR(error);
                else
                        filp = do_last(nd, &path, op, pathname);
-               if (!IS_ERR(cookie) && linki->i_op->put_link)
-                       linki->i_op->put_link(link.dentry, nd, cookie);
-               path_put(&link);
+               put_link(nd, &link, cookie);
        }
 out:
        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
@@ -2471,7 +2387,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        if (error)
                return error;
 
-       if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+       if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+           !ns_capable(inode_userns(dir), CAP_MKNOD))
                return -EPERM;
 
        if (!dir->i_op->mknod)
@@ -2632,10 +2549,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 }
 
 /*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
  *
  * A low-level filesystem can, if it choses, legally
  * do a
@@ -2648,10 +2565,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
  */
 void dentry_unhash(struct dentry *dentry)
 {
-       dget(dentry);
        shrink_dcache_parent(dentry);
        spin_lock(&dentry->d_lock);
-       if (dentry->d_count == 2)
+       if (dentry->d_count == 1)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
 }
@@ -2667,25 +2583,27 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -EPERM;
 
        mutex_lock(&dentry->d_inode->i_mutex);
-       dentry_unhash(dentry);
+
+       error = -EBUSY;
        if (d_mountpoint(dentry))
-               error = -EBUSY;
-       else {
-               error = security_inode_rmdir(dir, dentry);
-               if (!error) {
-                       error = dir->i_op->rmdir(dir, dentry);
-                       if (!error) {
-                               dentry->d_inode->i_flags |= S_DEAD;
-                               dont_mount(dentry);
-                       }
-               }
-       }
+               goto out;
+
+       error = security_inode_rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       shrink_dcache_parent(dentry);
+       error = dir->i_op->rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       dentry->d_inode->i_flags |= S_DEAD;
+       dont_mount(dentry);
+
+out:
        mutex_unlock(&dentry->d_inode->i_mutex);
-       if (!error) {
+       if (!error)
                d_delete(dentry);
-       }
-       dput(dentry);
-
        return error;
 }
 
@@ -2719,6 +2637,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
+       if (!dentry->d_inode) {
+               error = -ENOENT;
+               goto exit3;
+       }
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto exit3;
@@ -2807,8 +2729,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
-               if (inode)
-                       ihold(inode);
+               if (!inode)
+                       goto slashes;
+               ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
@@ -3056,12 +2979,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *        HOWEVER, it relies on the assumption that any object with ->lookup()
  *        has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *        we'd better make sure that there's no link(2) for them.
- *     d) some filesystems don't support opened-but-unlinked directories,
- *        either because of layout or because they are not ready to deal with
- *        all cases correctly. The latter will be fixed (taking this sort of
- *        stuff into VFS), but the former is not going away. Solution: the same
- *        trick as in rmdir().
- *     e) conversion from fhandle to dentry may come in the wrong moment - when
+ *     d) conversion from fhandle to dentry may come in the wrong moment - when
  *        we are removing the target. Solution: we will have to grab ->i_mutex
  *        in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
  *        ->i_mutex on parents, which works but leads to some truly excessive
@@ -3071,7 +2989,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry)
 {
        int error = 0;
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
 
        /*
         * If we are going to change the parent - check write permissions,
@@ -3087,26 +3005,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
 
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
-       if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else {
-               if (target)
-                       dentry_unhash(new_dentry);
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       }
+
+       error = -EBUSY;
+       if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+               goto out;
+
+       if (target)
+               shrink_dcache_parent(new_dentry);
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
        if (target) {
-               if (!error) {
-                       target->i_flags |= S_DEAD;
-                       dont_mount(new_dentry);
-               }
-               mutex_unlock(&target->i_mutex);
-               if (d_unhashed(new_dentry))
-                       d_rehash(new_dentry);
-               dput(new_dentry);
+               target->i_flags |= S_DEAD;
+               dont_mount(new_dentry);
        }
+out:
+       if (target)
+               mutex_unlock(&target->i_mutex);
        if (!error)
                if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                        d_move(old_dentry,new_dentry);
@@ -3116,7 +3034,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                            struct inode *new_dir, struct dentry *new_dentry)
 {
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
        int error;
 
        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3124,19 +3042,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                return error;
 
        dget(new_dentry);
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
+
+       error = -EBUSY;
        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       if (!error) {
-               if (target)
-                       dont_mount(new_dentry);
-               if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-                       d_move(old_dentry, new_dentry);
-       }
+               goto out;
+
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
+       if (target)
+               dont_mount(new_dentry);
+       if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+               d_move(old_dentry, new_dentry);
+out:
        if (target)
                mutex_unlock(&target->i_mutex);
        dput(new_dentry);