exofs: Move exofs specific osd operations out of ios.c
[linux-2.6.git] / fs / namei.c
index a98f7f1..14ab8d3 100644 (file)
@@ -70,7 +70,7 @@
  * name indicated by the symlink. The old code always complained that the
  * name already exists, due to not following the symlink even if its target
  * is nonexistent.  The new semantics affects also mknod() and link() when
- * the name is a symlink pointing to a non-existant name.
+ * the name is a symlink pointing to a non-existent name.
  *
  * I don't know which semantics is the right one, since I have no access
  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
        return retval;
 }
 
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
        char *tmp, *result;
 
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
 
                result = tmp;
                if (retval < 0) {
-                       __putname(tmp);
-                       result = ERR_PTR(retval);
+                       if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+                               __putname(tmp);
+                               result = ERR_PTR(retval);
+                       }
                }
        }
        audit_getname(result);
        return result;
 }
 
+char *getname(const char __user * filename)
+{
+       return getname_flags(filename, 0);
+}
+
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -172,10 +179,13 @@ EXPORT_SYMBOL(putname);
 static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
                int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
-       umode_t                 mode = inode->i_mode;
+       unsigned int mode = inode->i_mode;
 
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
+       if (current_user_ns() != inode_userns(inode))
+               goto other_perms;
+
        if (current_fsuid() == inode->i_uid)
                mode >>= 6;
        else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
                        mode >>= 3;
        }
 
+other_perms:
        /*
         * If the DACs are ok we don't need any capability check.
         */
@@ -227,10 +238,11 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
 
        /*
         * Read/write DACs are always overridable.
-        * Executable DACs are overridable if at least one exec bit is set.
+        * Executable DACs are overridable for all directories and
+        * for non-directories that have least one exec bit set.
         */
        if (!(mask & MAY_EXEC) || execute_ok(inode))
-               if (capable(CAP_DAC_OVERRIDE))
+               if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
                        return 0;
 
        /*
@@ -238,7 +250,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
         */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
-               if (capable(CAP_DAC_READ_SEARCH))
+               if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
                        return 0;
 
        return -EACCES;
@@ -380,111 +392,65 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
  */
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
-       struct fs_struct *fs = current->fs;
-       struct dentry *dentry = nd->path.dentry;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt) {
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt ||
-                               nd->root.dentry != fs->root.dentry)
-                       goto err_root;
-       }
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-       if (nd->root.mnt) {
-               path_get(&nd->root);
-               spin_unlock(&fs->lock);
-       }
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       nd->flags &= ~LOOKUP_RCU;
-       return 0;
-err:
-       spin_unlock(&dentry->d_lock);
-err_root:
-       if (nd->root.mnt)
-               spin_unlock(&fs->lock);
-       return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
-       if (nd->flags & LOOKUP_RCU)
-               return nameidata_drop_rcu(nd);
-       return 0;
-}
 
 /**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
  * Returns: 0 on success, -ECHILD on failure
  *
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
  */
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
-
-       /*
-        * It can be possible to revalidate the dentry that we started
-        * the path walk with. force_reval_path may also revalidate the
-        * dentry already committed to the nameidata.
-        */
-       if (unlikely(parent == dentry))
-               return nameidata_drop_rcu(nd);
+       int want_root = 0;
 
        BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt) {
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+               want_root = 1;
                spin_lock(&fs->lock);
                if (nd->root.mnt != fs->root.mnt ||
                                nd->root.dentry != fs->root.dentry)
                        goto err_root;
        }
        spin_lock(&parent->d_lock);
-       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       /*
-        * If the sequence check on the child dentry passed, then the child has
-        * not been removed from its parent. This means the parent dentry must
-        * be valid and able to take a reference at this point.
-        */
-       BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-       BUG_ON(!parent->d_count);
-       parent->d_count++;
-       spin_unlock(&dentry->d_lock);
+       if (!dentry) {
+               if (!__d_rcu_to_refcount(parent, nd->seq))
+                       goto err_parent;
+               BUG_ON(nd->inode != parent->d_inode);
+       } else {
+               if (dentry->d_parent != parent)
+                       goto err_parent;
+               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+               if (!__d_rcu_to_refcount(dentry, nd->seq))
+                       goto err_child;
+               /*
+                * If the sequence check on the child dentry passed, then
+                * the child has not been removed from its parent. This
+                * means the parent dentry must be valid and able to take
+                * a reference at this point.
+                */
+               BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+               BUG_ON(!parent->d_count);
+               parent->d_count++;
+               spin_unlock(&dentry->d_lock);
+       }
        spin_unlock(&parent->d_lock);
-       if (nd->root.mnt) {
+       if (want_root) {
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }
@@ -494,67 +460,17 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
        br_read_unlock(vfsmount_lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
-err:
+
+err_child:
        spin_unlock(&dentry->d_lock);
+err_parent:
        spin_unlock(&parent->d_lock);
 err_root:
-       if (nd->root.mnt)
+       if (want_root)
                spin_unlock(&fs->lock);
        return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
-       if (nd->flags & LOOKUP_RCU)
-               return nameidata_dentry_drop_rcu(nd, dentry);
-       return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
-       struct dentry *dentry = nd->path.dentry;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       nd->flags &= ~LOOKUP_RCU;
-       nd->root.mnt = NULL;
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err_unlock;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-
-       return 0;
-
-err_unlock:
-       spin_unlock(&dentry->d_lock);
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
-{
-       if (likely(nd->flags & LOOKUP_RCU))
-               return nameidata_drop_rcu_last(nd);
-       return 0;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -571,25 +487,9 @@ void release_open_intent(struct nameidata *nd)
        }
 }
 
-/*
- * Call d_revalidate and handle filesystems that request rcu-walk
- * to be dropped. This may be called and return in rcu-walk mode,
- * regardless of success or error. If -ECHILD is returned, the caller
- * must return -ECHILD back up the path walk stack so path walk may
- * be restarted in ref-walk mode.
- */
-static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-       int status;
-
-       status = dentry->d_op->d_revalidate(dentry, nd);
-       if (status == -ECHILD) {
-               if (nameidata_dentry_drop_rcu(nd, dentry))
-                       return status;
-               status = dentry->d_op->d_revalidate(dentry, nd);
-       }
-
-       return status;
+       return dentry->d_op->d_revalidate(dentry, nd);
 }
 
 static struct dentry *
@@ -614,76 +514,57 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
        return dentry;
 }
 
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
+ *
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
+ */
+static int complete_walk(struct nameidata *nd)
 {
-       int status = dentry->d_op->d_revalidate(dentry, nd);
-       if (likely(status > 0))
-               return dentry;
-       if (status == -ECHILD) {
-               if (nameidata_dentry_drop_rcu(nd, dentry))
-                       return ERR_PTR(-ECHILD);
-               return do_revalidate(dentry, nd);
-       }
-       if (status < 0)
-               return ERR_PTR(status);
-       /* Don't d_invalidate in rcu-walk mode */
-       if (nameidata_dentry_drop_rcu(nd, dentry))
-               return ERR_PTR(-ECHILD);
-       if (!d_invalidate(dentry)) {
-               dput(dentry);
-               dentry = NULL;
+       struct dentry *dentry = nd->path.dentry;
+       int status;
+
+       if (nd->flags & LOOKUP_RCU) {
+               nd->flags &= ~LOOKUP_RCU;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               spin_lock(&dentry->d_lock);
+               if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+                       spin_unlock(&dentry->d_lock);
+                       rcu_read_unlock();
+                       br_read_unlock(vfsmount_lock);
+                       return -ECHILD;
+               }
+               BUG_ON(nd->inode != dentry->d_inode);
+               spin_unlock(&dentry->d_lock);
+               mntget(nd->path.mnt);
+               rcu_read_unlock();
+               br_read_unlock(vfsmount_lock);
        }
-       return dentry;
-}
 
-static inline int need_reval_dot(struct dentry *dentry)
-{
-       if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+       if (likely(!(nd->flags & LOOKUP_JUMPED)))
                return 0;
 
-       if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+       if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
                return 0;
 
-       return 1;
-}
-
-/*
- * force_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
- *
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
- */
-static int
-force_reval_path(struct path *path, struct nameidata *nd)
-{
-       int status;
-       struct dentry *dentry = path->dentry;
-
-       /*
-        * only check on filesystems where it's possible for the dentry to
-        * become stale.
-        */
-       if (!need_reval_dot(dentry))
+       if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
                return 0;
 
+       /* Note: we do not d_invalidate() */
        status = d_revalidate(dentry, nd);
        if (status > 0)
                return 0;
 
-       if (!status) {
-               d_invalidate(dentry);
+       if (!status)
                status = -ESTALE;
-       }
+
+       path_put(&nd->path);
        return status;
 }
 
@@ -699,6 +580,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
 static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
        int ret;
+       struct user_namespace *ns = inode_userns(inode);
 
        if (inode->i_op->permission) {
                ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -711,7 +593,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
        if (ret == -ECHILD)
                return ret;
 
-       if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
+       if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
+                       ns_capable(ns, CAP_DAC_READ_SEARCH))
                goto ok;
 
        return ret;
@@ -736,6 +619,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
+                       nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        }
 }
@@ -752,6 +636,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->root);
+               nd->flags |= LOOKUP_JUMPED;
        }
        nd->inode = nd->path.dentry->d_inode;
 
@@ -781,19 +666,42 @@ static inline void path_to_nameidata(const struct path *path,
        nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+       struct inode *inode = link->dentry->d_inode;
+       if (!IS_ERR(cookie) && inode->i_op->put_link)
+               inode->i_op->put_link(link->dentry, nd, cookie);
+       path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
        int error;
        struct dentry *dentry = link->dentry;
 
        BUG_ON(nd->flags & LOOKUP_RCU);
 
+       if (link->mnt == nd->path.mnt)
+               mntget(link->mnt);
+
+       if (unlikely(current->total_link_count >= 40)) {
+               *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       cond_resched();
+       current->total_link_count++;
+
        touch_atime(link->mnt, dentry);
        nd_set_link(nd, NULL);
 
-       if (link->mnt == nd->path.mnt)
-               mntget(link->mnt);
+       error = security_inode_follow_link(link->dentry, nd);
+       if (error) {
+               *p = ERR_PTR(error); /* no ->put_link(), please */
+               path_put(&nd->path);
+               return error;
+       }
 
        nd->last_type = LAST_BIND;
        *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -804,55 +712,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
                if (s)
                        error = __vfs_follow_link(nd, s);
                else if (nd->last_type == LAST_BIND) {
-                       error = force_reval_path(&nd->path, nd);
-                       if (error)
+                       nd->flags |= LOOKUP_JUMPED;
+                       nd->inode = nd->path.dentry->d_inode;
+                       if (nd->inode->i_op->follow_link) {
+                               /* stepped on a _really_ weird one */
                                path_put(&nd->path);
+                               error = -ELOOP;
+                       }
                }
        }
        return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
-{
-       void *cookie;
-       int err = -ELOOP;
-
-       /* We drop rcu-walk here */
-       if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-               return -ECHILD;
-
-       if (current->link_count >= MAX_NESTED_LINKS)
-               goto loop;
-       if (current->total_link_count >= 40)
-               goto loop;
-       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-       cond_resched();
-       err = security_inode_follow_link(path->dentry, nd);
-       if (err)
-               goto loop;
-       current->link_count++;
-       current->total_link_count++;
-       nd->depth++;
-       err = __do_follow_link(path, nd, &cookie);
-       if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-               path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-       path_put(path);
-       current->link_count--;
-       nd->depth--;
-       return err;
-loop:
-       path_put_conditional(path, nd);
-       path_put(&nd->path);
-       return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
        struct vfsmount *parent;
@@ -944,6 +815,11 @@ static int follow_automount(struct path *path, unsigned flags,
        if (!mnt) /* mount collision */
                return 0;
 
+       if (!*need_mntput) {
+               /* lock_mount() may release path->mnt on error */
+               mntget(path->mnt);
+               *need_mntput = true;
+       }
        err = finish_automount(mnt, path);
 
        switch (err) {
@@ -951,12 +827,9 @@ static int follow_automount(struct path *path, unsigned flags,
                /* Someone else made a mount here whilst we were busy */
                return 0;
        case 0:
-               dput(path->dentry);
-               if (*need_mntput)
-                       mntput(path->mnt);
+               path_put(path);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
-               *need_mntput = true;
                return 0;
        default:
                return err;
@@ -976,9 +849,10 @@ static int follow_automount(struct path *path, unsigned flags,
  */
 static int follow_managed(struct path *path, unsigned flags)
 {
+       struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
        unsigned managed;
        bool need_mntput = false;
-       int ret;
+       int ret = 0;
 
        /* Given that we're not holding a lock here, we retain the value in a
         * local variable for each dentry as we look at it so that we don't see
@@ -991,10 +865,9 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(path->dentry,
-                                                          false, false);
+                       ret = path->dentry->d_op->d_manage(path->dentry, false);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                }
 
                /* Transit to a mounted filesystem. */
@@ -1020,14 +893,19 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_NEED_AUTOMOUNT) {
                        ret = follow_automount(path, flags, &need_mntput);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                        continue;
                }
 
                /* We didn't change the current path point */
                break;
        }
-       return 0;
+
+       if (need_mntput && path->mnt == mnt)
+               mntput(path->mnt);
+       if (ret == -EISDIR)
+               ret = 0;
+       return ret;
 }
 
 int follow_down_one(struct path *path)
@@ -1045,38 +923,62 @@ int follow_down_one(struct path *path)
        return 0;
 }
 
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+               dentry->d_op->d_manage(dentry, true) < 0);
+}
+
 /*
- * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..".  True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+ * we meet a managed dentry that would need blocking.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-                              struct inode **inode, bool reverse_transit)
+                              struct inode **inode)
 {
-       while (d_mountpoint(path->dentry)) {
+       for (;;) {
                struct vfsmount *mounted;
-               if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-                   !reverse_transit &&
-                   path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+               /*
+                * Don't forget we might have a non-mountpoint managed dentry
+                * that wants to block transit.
+                */
+               if (unlikely(managed_dentry_might_block(path->dentry)))
                        return false;
+
+               if (!d_mountpoint(path->dentry))
+                       break;
+
                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                if (!mounted)
                        break;
                path->mnt = mounted;
                path->dentry = mounted->mnt_root;
                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+               /*
+                * Update the inode too. We don't need to re-check the
+                * dentry sequence number here after this d_inode read,
+                * because a mount-point is always pinned.
+                */
                *inode = path->dentry->d_inode;
        }
-
-       if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-               return reverse_transit;
        return true;
 }
 
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
 {
-       struct inode *inode = nd->inode;
+       while (d_mountpoint(nd->path.dentry)) {
+               struct vfsmount *mounted;
+               mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+               if (!mounted)
+                       break;
+               nd->path.mnt = mounted;
+               nd->path.dentry = mounted->mnt_root;
+               nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+       }
+}
 
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
        set_root_rcu(nd);
 
        while (1) {
@@ -1091,8 +993,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
-                               return -ECHILD;
-                       inode = parent->d_inode;
+                               goto failed;
                        nd->path.dentry = parent;
                        nd->seq = seq;
                        break;
@@ -1100,23 +1001,26 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                if (!follow_up_rcu(&nd->path))
                        break;
                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-               inode = nd->path.dentry->d_inode;
        }
-       __follow_mount_rcu(nd, &nd->path, &inode, true);
-       nd->inode = inode;
-
+       follow_mount_rcu(nd);
+       nd->inode = nd->path.dentry->d_inode;
        return 0;
+
+failed:
+       nd->flags &= ~LOOKUP_RCU;
+       if (!(nd->flags & LOOKUP_ROOT))
+               nd->root.mnt = NULL;
+       rcu_read_unlock();
+       br_read_unlock(vfsmount_lock);
+       return -ECHILD;
 }
 
 /*
  * Follow down to the covering mount currently visible to userspace.  At each
  * point, the filesystem owning that dentry may be queried as to whether the
  * caller is permitted to proceed or not.
- *
- * Care must be taken as namespace_sem may be held (indicated by mounting_here
- * being true).
  */
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
 {
        unsigned managed;
        int ret;
@@ -1137,7 +1041,7 @@ int follow_down(struct path *path, bool mounting_here)
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(
-                               path->dentry, mounting_here, false);
+                               path->dentry, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }
@@ -1239,68 +1143,83 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
-       struct inode *dir;
+       int need_reval = 1;
+       int status = 1;
        int err;
 
        /*
-        * See if the low-level filesystem might want
-        * to use its own hash..
-        */
-       if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-               err = parent->d_op->d_hash(parent, nd->inode, name);
-               if (err < 0)
-                       return err;
-       }
-
-       /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, we're going to
         * do the non-racy lookup, below.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
-
                *inode = nd->inode;
                dentry = __d_lookup_rcu(parent, name, &seq, inode);
-               if (!dentry) {
-                       if (nameidata_drop_rcu(nd))
-                               return -ECHILD;
-                       goto need_lookup;
-               }
+               if (!dentry)
+                       goto unlazy;
+
                /* Memory barrier in read_seqcount_begin of child is enough */
                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                        return -ECHILD;
-
                nd->seq = seq;
+
                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-                       dentry = do_revalidate_rcu(dentry, nd);
-                       if (!dentry)
-                               goto need_lookup;
-                       if (IS_ERR(dentry))
-                               goto fail;
-                       if (!(nd->flags & LOOKUP_RCU))
-                               goto done;
+                       status = d_revalidate(dentry, nd);
+                       if (unlikely(status <= 0)) {
+                               if (status != -ECHILD)
+                                       need_reval = 0;
+                               goto unlazy;
+                       }
                }
                path->mnt = mnt;
                path->dentry = dentry;
-               if (likely(__follow_mount_rcu(nd, path, inode, false)))
-                       return 0;
-               if (nameidata_drop_rcu(nd))
+               if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+                       goto unlazy;
+               if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                       goto unlazy;
+               return 0;
+unlazy:
+               if (unlazy_walk(nd, dentry))
                        return -ECHILD;
-               /* fallthru */
+       } else {
+               dentry = __d_lookup(parent, name);
        }
-       dentry = __d_lookup(parent, name);
-       if (!dentry)
-               goto need_lookup;
-found:
-       if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-               dentry = do_revalidate(dentry, nd);
-               if (!dentry)
-                       goto need_lookup;
-               if (IS_ERR(dentry))
-                       goto fail;
+
+retry:
+       if (unlikely(!dentry)) {
+               struct inode *dir = parent->d_inode;
+               BUG_ON(nd->inode != dir);
+
+               mutex_lock(&dir->i_mutex);
+               dentry = d_lookup(parent, name);
+               if (likely(!dentry)) {
+                       dentry = d_alloc_and_lookup(parent, name, nd);
+                       if (IS_ERR(dentry)) {
+                               mutex_unlock(&dir->i_mutex);
+                               return PTR_ERR(dentry);
+                       }
+                       /* known good */
+                       need_reval = 0;
+                       status = 1;
+               }
+               mutex_unlock(&dir->i_mutex);
+       }
+       if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+               status = d_revalidate(dentry, nd);
+       if (unlikely(status <= 0)) {
+               if (status < 0) {
+                       dput(dentry);
+                       return status;
+               }
+               if (!d_invalidate(dentry)) {
+                       dput(dentry);
+                       dentry = NULL;
+                       need_reval = 1;
+                       goto retry;
+               }
        }
-done:
+
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
@@ -1310,39 +1229,117 @@ done:
        }
        *inode = path->dentry->d_inode;
        return 0;
+}
 
-need_lookup:
-       dir = parent->d_inode;
-       BUG_ON(nd->inode != dir);
+static inline int may_lookup(struct nameidata *nd)
+{
+       if (nd->flags & LOOKUP_RCU) {
+               int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+               if (err != -ECHILD)
+                       return err;
+               if (unlazy_walk(nd, NULL))
+                       return -ECHILD;
+       }
+       return exec_permission(nd->inode, 0);
+}
 
-       mutex_lock(&dir->i_mutex);
-       /*
-        * First re-do the cached lookup just in case it was created
-        * while we waited for the directory semaphore, or the first
-        * lookup failed due to an unrelated rename.
-        *
-        * This could use version numbering or similar to avoid unnecessary
-        * cache lookups, but then we'd have to do the first lookup in the
-        * non-racy way. However in the common case here, everything should
-        * be hot in cache, so would it be a big win?
-        */
-       dentry = d_lookup(parent, name);
-       if (likely(!dentry)) {
-               dentry = d_alloc_and_lookup(parent, name, nd);
-               mutex_unlock(&dir->i_mutex);
-               if (IS_ERR(dentry))
-                       goto fail;
-               goto done;
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+       if (type == LAST_DOTDOT) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (follow_dotdot_rcu(nd))
+                               return -ECHILD;
+               } else
+                       follow_dotdot(nd);
+       }
+       return 0;
+}
+
+static void terminate_walk(struct nameidata *nd)
+{
+       if (!(nd->flags & LOOKUP_RCU)) {
+               path_put(&nd->path);
+       } else {
+               nd->flags &= ~LOOKUP_RCU;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               rcu_read_unlock();
+               br_read_unlock(vfsmount_lock);
        }
+}
+
+static inline int walk_component(struct nameidata *nd, struct path *path,
+               struct qstr *name, int type, int follow)
+{
+       struct inode *inode;
+       int err;
        /*
-        * Uhhuh! Nasty case: the cache was re-populated while
-        * we waited on the semaphore. Need to revalidate.
+        * "." and ".." are special - ".." especially so because it has
+        * to be able to know about the current root directory and
+        * parent relationships.
         */
-       mutex_unlock(&dir->i_mutex);
-       goto found;
+       if (unlikely(type != LAST_NORM))
+               return handle_dots(nd, type);
+       err = do_lookup(nd, name, path, &inode);
+       if (unlikely(err)) {
+               terminate_walk(nd);
+               return err;
+       }
+       if (!inode) {
+               path_to_nameidata(path, nd);
+               terminate_walk(nd);
+               return -ENOENT;
+       }
+       if (unlikely(inode->i_op->follow_link) && follow) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, path->dentry))) {
+                               terminate_walk(nd);
+                               return -ECHILD;
+                       }
+               }
+               BUG_ON(inode != path->dentry->d_inode);
+               return 1;
+       }
+       path_to_nameidata(path, nd);
+       nd->inode = inode;
+       return 0;
+}
 
-fail:
-       return PTR_ERR(dentry);
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+       int res;
+
+       if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+               path_put_conditional(path, nd);
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+
+       nd->depth++;
+       current->link_count++;
+
+       do {
+               struct path link = *path;
+               void *cookie;
+
+               res = follow_link(&link, nd, &cookie);
+               if (!res)
+                       res = walk_component(nd, path, &nd->last,
+                                            nd->last_type, LOOKUP_FOLLOW);
+               put_link(nd, &link, cookie);
+       } while (res > 0);
+
+       current->link_count--;
+       nd->depth--;
+       return res;
 }
 
 /*
@@ -1362,30 +1359,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        while (*name=='/')
                name++;
        if (!*name)
-               goto return_reval;
-
-       if (nd->depth)
-               lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
+               return 0;
 
        /* At this point we know we have a real path component. */
        for(;;) {
-               struct inode *inode;
                unsigned long hash;
                struct qstr this;
                unsigned int c;
+               int type;
 
                nd->flags |= LOOKUP_CONTINUE;
-               if (nd->flags & LOOKUP_RCU) {
-                       err = exec_permission(nd->inode, IPERM_FLAG_RCU);
-                       if (err == -ECHILD) {
-                               if (nameidata_drop_rcu(nd))
-                                       return -ECHILD;
-                               goto exec_again;
-                       }
-               } else {
-exec_again:
-                       err = exec_permission(nd->inode, 0);
-               }
+
+               err = may_lookup(nd);
                if (err)
                        break;
 
@@ -1401,53 +1386,43 @@ exec_again:
                this.len = name - (const char *) this.name;
                this.hash = end_name_hash(hash);
 
+               type = LAST_NORM;
+               if (this.name[0] == '.') switch (this.len) {
+                       case 2:
+                               if (this.name[1] == '.') {
+                                       type = LAST_DOTDOT;
+                                       nd->flags |= LOOKUP_JUMPED;
+                               }
+                               break;
+                       case 1:
+                               type = LAST_DOT;
+               }
+               if (likely(type == LAST_NORM)) {
+                       struct dentry *parent = nd->path.dentry;
+                       nd->flags &= ~LOOKUP_JUMPED;
+                       if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+                               err = parent->d_op->d_hash(parent, nd->inode,
+                                                          &this);
+                               if (err < 0)
+                                       break;
+                       }
+               }
+
                /* remove trailing slashes? */
                if (!c)
                        goto last_component;
                while (*++name == '/');
                if (!*name)
-                       goto last_with_slashes;
+                       goto last_component;
 
-               /*
-                * "." and ".." are special - ".." especially so because it has
-                * to be able to know about the current root directory and
-                * parent relationships.
-                */
-               if (this.name[0] == '.') switch (this.len) {
-                       default:
-                               break;
-                       case 2:
-                               if (this.name[1] != '.')
-                                       break;
-                               if (nd->flags & LOOKUP_RCU) {
-                                       if (follow_dotdot_rcu(nd))
-                                               return -ECHILD;
-                               } else
-                                       follow_dotdot(nd);
-                               /* fallthrough */
-                       case 1:
-                               continue;
-               }
-               /* This does the actual lookups.. */
-               err = do_lookup(nd, &this, &next, &inode);
-               if (err)
-                       break;
-               err = -ENOENT;
-               if (!inode)
-                       goto out_dput;
+               err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+               if (err < 0)
+                       return err;
 
-               if (inode->i_op->follow_link) {
-                       BUG_ON(inode != next.dentry->d_inode);
-                       err = do_follow_link(&next, nd);
+               if (err) {
+                       err = nested_symlink(&next, nd);
                        if (err)
-                               goto return_err;
-                       nd->inode = nd->path.dentry->d_inode;
-                       err = -ENOENT;
-                       if (!nd->inode)
-                               break;
-               } else {
-                       path_to_nameidata(&next, nd);
-                       nd->inode = inode;
+                               return err;
                }
                err = -ENOTDIR; 
                if (!nd->inode->i_op->lookup)
@@ -1455,207 +1430,109 @@ exec_again:
                continue;
                /* here ends the main loop */
 
-last_with_slashes:
-               lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
                /* Clear LOOKUP_CONTINUE iff it was previously unset */
                nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-               if (lookup_flags & LOOKUP_PARENT)
-                       goto lookup_parent;
-               if (this.name[0] == '.') switch (this.len) {
-                       default:
-                               break;
-                       case 2:
-                               if (this.name[1] != '.')
-                                       break;
-                               if (nd->flags & LOOKUP_RCU) {
-                                       if (follow_dotdot_rcu(nd))
-                                               return -ECHILD;
-                               } else
-                                       follow_dotdot(nd);
-                               /* fallthrough */
-                       case 1:
-                               goto return_reval;
-               }
-               err = do_lookup(nd, &this, &next, &inode);
-               if (err)
-                       break;
-               if (inode && unlikely(inode->i_op->follow_link) &&
-                   (lookup_flags & LOOKUP_FOLLOW)) {
-                       BUG_ON(inode != next.dentry->d_inode);
-                       err = do_follow_link(&next, nd);
-                       if (err)
-                               goto return_err;
-                       nd->inode = nd->path.dentry->d_inode;
-               } else {
-                       path_to_nameidata(&next, nd);
-                       nd->inode = inode;
-               }
-               err = -ENOENT;
-               if (!nd->inode)
-                       break;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
-                       if (!nd->inode->i_op->lookup)
-                               break;
-               }
-               goto return_base;
-lookup_parent:
                nd->last = this;
-               nd->last_type = LAST_NORM;
-               if (this.name[0] != '.')
-                       goto return_base;
-               if (this.len == 1)
-                       nd->last_type = LAST_DOT;
-               else if (this.len == 2 && this.name[1] == '.')
-                       nd->last_type = LAST_DOTDOT;
-               else
-                       goto return_base;
-return_reval:
-               /*
-                * We bypassed the ordinary revalidation routines.
-                * We may need to check the cached dentry for staleness.
-                */
-               if (need_reval_dot(nd->path.dentry)) {
-                       /* Note: we do not d_invalidate() */
-                       err = d_revalidate(nd->path.dentry, nd);
-                       if (!err)
-                               err = -ESTALE;
-                       if (err < 0)
-                               break;
-               }
-return_base:
-               if (nameidata_drop_rcu_last_maybe(nd))
-                       return -ECHILD;
+               nd->last_type = type;
                return 0;
-out_dput:
-               if (!(nd->flags & LOOKUP_RCU))
-                       path_put_conditional(&next, nd);
-               break;
        }
-       if (!(nd->flags & LOOKUP_RCU))
-               path_put(&nd->path);
-return_err:
+       terminate_walk(nd);
        return err;
 }
 
-static inline int path_walk_rcu(const char *name, struct nameidata *nd)
-{
-       current->total_link_count = 0;
-
-       return link_path_walk(name, nd);
-}
-
-static inline int path_walk_simple(const char *name, struct nameidata *nd)
-{
-       current->total_link_count = 0;
-
-       return link_path_walk(name, nd);
-}
-
-static int path_walk(const char *name, struct nameidata *nd)
-{
-       struct path save = nd->path;
-       int result;
-
-       current->total_link_count = 0;
-
-       /* make sure the stuff we saved doesn't go away */
-       path_get(&save);
-
-       result = link_path_walk(name, nd);
-       if (result == -ESTALE) {
-               /* nd->path had been dropped */
-               current->total_link_count = 0;
-               nd->path = save;
-               path_get(&nd->path);
-               nd->flags |= LOOKUP_REVAL;
-               result = link_path_walk(name, nd);
-       }
-
-       path_put(&save);
-
-       return result;
-}
-
-static void path_finish_rcu(struct nameidata *nd)
-{
-       if (nd->flags & LOOKUP_RCU) {
-               /* RCU dangling. Cancel it. */
-               nd->flags &= ~LOOKUP_RCU;
-               nd->root.mnt = NULL;
-               rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
-       }
-       if (nd->file)
-               fput(nd->file);
-}
-
-static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags,
+                    struct nameidata *nd, struct file **fp)
 {
        int retval = 0;
        int fput_needed;
        struct file *file;
 
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
-       nd->flags = flags | LOOKUP_RCU;
+       nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;
+       if (flags & LOOKUP_ROOT) {
+               struct inode *inode = nd->root.dentry->d_inode;
+               if (*name) {
+                       if (!inode->i_op->lookup)
+                               return -ENOTDIR;
+                       retval = inode_permission(inode, MAY_EXEC);
+                       if (retval)
+                               return retval;
+               }
+               nd->path = nd->root;
+               nd->inode = inode;
+               if (flags & LOOKUP_RCU) {
+                       br_read_lock(vfsmount_lock);
+                       rcu_read_lock();
+                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+               } else {
+                       path_get(&nd->path);
+               }
+               return 0;
+       }
+
        nd->root.mnt = NULL;
-       nd->file = NULL;
 
        if (*name=='/') {
-               struct fs_struct *fs = current->fs;
-               unsigned seq;
-
-               br_read_lock(vfsmount_lock);
-               rcu_read_lock();
-
-               do {
-                       seq = read_seqcount_begin(&fs->seq);
-                       nd->root = fs->root;
-                       nd->path = nd->root;
-                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-               } while (read_seqcount_retry(&fs->seq, seq));
-
+               if (flags & LOOKUP_RCU) {
+                       br_read_lock(vfsmount_lock);
+                       rcu_read_lock();
+                       set_root_rcu(nd);
+               } else {
+                       set_root(nd);
+                       path_get(&nd->root);
+               }
+               nd->path = nd->root;
        } else if (dfd == AT_FDCWD) {
-               struct fs_struct *fs = current->fs;
-               unsigned seq;
+               if (flags & LOOKUP_RCU) {
+                       struct fs_struct *fs = current->fs;
+                       unsigned seq;
 
-               br_read_lock(vfsmount_lock);
-               rcu_read_lock();
-
-               do {
-                       seq = read_seqcount_begin(&fs->seq);
-                       nd->path = fs->pwd;
-                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-               } while (read_seqcount_retry(&fs->seq, seq));
+                       br_read_lock(vfsmount_lock);
+                       rcu_read_lock();
 
+                       do {
+                               seq = read_seqcount_begin(&fs->seq);
+                               nd->path = fs->pwd;
+                               nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                       } while (read_seqcount_retry(&fs->seq, seq));
+               } else {
+                       get_fs_pwd(current->fs, &nd->path);
+               }
        } else {
                struct dentry *dentry;
 
-               file = fget_light(dfd, &fput_needed);
+               file = fget_raw_light(dfd, &fput_needed);
                retval = -EBADF;
                if (!file)
                        goto out_fail;
 
                dentry = file->f_path.dentry;
 
-               retval = -ENOTDIR;
-               if (!S_ISDIR(dentry->d_inode->i_mode))
-                       goto fput_fail;
+               if (*name) {
+                       retval = -ENOTDIR;
+                       if (!S_ISDIR(dentry->d_inode->i_mode))
+                               goto fput_fail;
 
-               retval = file_permission(file, MAY_EXEC);
-               if (retval)
-                       goto fput_fail;
+                       retval = file_permission(file, MAY_EXEC);
+                       if (retval)
+                               goto fput_fail;
+               }
 
                nd->path = file->f_path;
-               if (fput_needed)
-                       nd->file = file;
-
-               nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-               br_read_lock(vfsmount_lock);
-               rcu_read_lock();
+               if (flags & LOOKUP_RCU) {
+                       if (fput_needed)
+                               *fp = file;
+                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                       br_read_lock(vfsmount_lock);
+                       rcu_read_lock();
+               } else {
+                       path_get(&file->f_path);
+                       fput_light(file, fput_needed);
+               }
        }
+
        nd->inode = nd->path.dentry->d_inode;
        return 0;
 
@@ -1665,60 +1542,23 @@ out_fail:
        return retval;
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static inline int lookup_last(struct nameidata *nd, struct path *path)
 {
-       int retval = 0;
-       int fput_needed;
-       struct file *file;
-
-       nd->last_type = LAST_ROOT; /* if there are only slashes... */
-       nd->flags = flags;
-       nd->depth = 0;
-       nd->root.mnt = NULL;
+       if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+               nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
-       if (*name=='/') {
-               set_root(nd);
-               nd->path = nd->root;
-               path_get(&nd->root);
-       } else if (dfd == AT_FDCWD) {
-               get_fs_pwd(current->fs, &nd->path);
-       } else {
-               struct dentry *dentry;
-
-               file = fget_light(dfd, &fput_needed);
-               retval = -EBADF;
-               if (!file)
-                       goto out_fail;
-
-               dentry = file->f_path.dentry;
-
-               retval = -ENOTDIR;
-               if (!S_ISDIR(dentry->d_inode->i_mode))
-                       goto fput_fail;
-
-               retval = file_permission(file, MAY_EXEC);
-               if (retval)
-                       goto fput_fail;
-
-               nd->path = file->f_path;
-               path_get(&file->f_path);
-
-               fput_light(file, fput_needed);
-       }
-       nd->inode = nd->path.dentry->d_inode;
-       return 0;
-
-fput_fail:
-       fput_light(file, fput_needed);
-out_fail:
-       return retval;
+       nd->flags &= ~LOOKUP_PARENT;
+       return walk_component(nd, path, &nd->last, nd->last_type,
+                                       nd->flags & LOOKUP_FOLLOW);
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
+static int path_lookupat(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
 {
-       int retval;
+       struct file *base = NULL;
+       struct path path;
+       int err;
 
        /*
         * Path walking is largely split up into 2 different synchronisation
@@ -1734,44 +1574,68 @@ static int do_path_lookup(int dfd, const char *name,
         * be handled by restarting a traditional ref-walk (which will always
         * be able to complete).
         */
-       retval = path_init_rcu(dfd, name, flags, nd);
-       if (unlikely(retval))
-               return retval;
-       retval = path_walk_rcu(name, nd);
-       path_finish_rcu(nd);
-       if (nd->root.mnt) {
-               path_put(&nd->root);
-               nd->root.mnt = NULL;
+       err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
+
+       if (unlikely(err))
+               return err;
+
+       current->total_link_count = 0;
+       err = link_path_walk(name, nd);
+
+       if (!err && !(flags & LOOKUP_PARENT)) {
+               err = lookup_last(nd, &path);
+               while (err > 0) {
+                       void *cookie;
+                       struct path link = path;
+                       nd->flags |= LOOKUP_PARENT;
+                       err = follow_link(&link, nd, &cookie);
+                       if (!err)
+                               err = lookup_last(nd, &path);
+                       put_link(nd, &link, cookie);
+               }
        }
 
-       if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
-               /* slower, locked walk */
-               if (retval == -ESTALE)
-                       flags |= LOOKUP_REVAL;
-               retval = path_init(dfd, name, flags, nd);
-               if (unlikely(retval))
-                       return retval;
-               retval = path_walk(name, nd);
-               if (nd->root.mnt) {
-                       path_put(&nd->root);
-                       nd->root.mnt = NULL;
+       if (!err)
+               err = complete_walk(nd);
+
+       if (!err && nd->flags & LOOKUP_DIRECTORY) {
+               if (!nd->inode->i_op->lookup) {
+                       path_put(&nd->path);
+                       err = -ENOTDIR;
                }
        }
 
+       if (base)
+               fput(base);
+
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+               path_put(&nd->root);
+               nd->root.mnt = NULL;
+       }
+       return err;
+}
+
+static int do_path_lookup(int dfd, const char *name,
+                               unsigned int flags, struct nameidata *nd)
+{
+       int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+       if (unlikely(retval == -ECHILD))
+               retval = path_lookupat(dfd, name, flags, nd);
+       if (unlikely(retval == -ESTALE))
+               retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
+
        if (likely(!retval)) {
                if (unlikely(!audit_dummy_context())) {
                        if (nd->path.dentry && nd->inode)
                                audit_inode(name, nd->path.dentry);
                }
        }
-
        return retval;
 }
 
-int path_lookup(const char *name, unsigned int flags,
-                       struct nameidata *nd)
+int kern_path_parent(const char *name, struct nameidata *nd)
 {
-       return do_path_lookup(AT_FDCWD, name, flags, nd);
+       return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
 }
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1795,29 +1659,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct nameidata *nd)
 {
-       int retval;
-
-       /* same as do_path_lookup */
-       nd->last_type = LAST_ROOT;
-       nd->flags = flags;
-       nd->depth = 0;
-
-       nd->path.dentry = dentry;
-       nd->path.mnt = mnt;
-       path_get(&nd->path);
-       nd->root = nd->path;
-       path_get(&nd->root);
-       nd->inode = nd->path.dentry->d_inode;
-
-       retval = path_walk(name, nd);
-       if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-                               nd->inode))
-               audit_inode(name, nd->path.dentry);
-
-       path_put(&nd->root);
-       nd->root.mnt = NULL;
-
-       return retval;
+       nd->root.dentry = dentry;
+       nd->root.mnt = mnt;
+       /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
+       return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -1832,17 +1677,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
                return ERR_PTR(err);
 
        /*
-        * See if the low-level filesystem might want
-        * to use its own hash..
-        */
-       if (base->d_flags & DCACHE_OP_HASH) {
-               err = base->d_op->d_hash(base, inode, name);
-               dentry = ERR_PTR(err);
-               if (err < 0)
-                       goto out;
-       }
-
-       /*
         * Don't bother with __d_lookup: callers are for creat as
         * well as unlink, so a lot of the time it would cost
         * a double lookup.
@@ -1854,7 +1688,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 
        if (!dentry)
                dentry = d_alloc_and_lookup(base, name, nd);
-out:
+
        return dentry;
 }
 
@@ -1868,28 +1702,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
-static int __lookup_one_len(const char *name, struct qstr *this,
-               struct dentry *base, int len)
-{
-       unsigned long hash;
-       unsigned int c;
-
-       this->name = name;
-       this->len = len;
-       if (!len)
-               return -EACCES;
-
-       hash = init_name_hash();
-       while (len--) {
-               c = *(const unsigned char *)name++;
-               if (c == '/' || c == '\0')
-                       return -EACCES;
-               hash = partial_name_hash(c, hash);
-       }
-       this->hash = end_name_hash(hash);
-       return 0;
-}
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:      pathname component to lookup
@@ -1903,14 +1715,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
-       int err;
        struct qstr this;
+       unsigned long hash;
+       unsigned int c;
 
        WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
 
-       err = __lookup_one_len(name, &this, base, len);
-       if (err)
-               return ERR_PTR(err);
+       this.name = name;
+       this.len = len;
+       if (!len)
+               return ERR_PTR(-EACCES);
+
+       hash = init_name_hash();
+       while (len--) {
+               c = *(const unsigned char *)name++;
+               if (c == '/' || c == '\0')
+                       return ERR_PTR(-EACCES);
+               hash = partial_name_hash(c, hash);
+       }
+       this.hash = end_name_hash(hash);
+       /*
+        * See if the low-level filesystem might want
+        * to use its own hash..
+        */
+       if (base->d_flags & DCACHE_OP_HASH) {
+               int err = base->d_op->d_hash(base, base->d_inode, &this);
+               if (err < 0)
+                       return ERR_PTR(err);
+       }
 
        return __lookup_hash(&this, base, NULL);
 }
@@ -1919,7 +1751,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
 {
        struct nameidata nd;
-       char *tmp = getname(name);
+       char *tmp = getname_flags(name, flags);
        int err = PTR_ERR(tmp);
        if (!IS_ERR(tmp)) {
 
@@ -1961,11 +1793,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
 
        if (!(dir->i_mode & S_ISVTX))
                return 0;
+       if (current_user_ns() != inode_userns(inode))
+               goto other_userns;
        if (inode->i_uid == fsuid)
                return 0;
        if (dir->i_uid == fsuid)
                return 0;
-       return !capable(CAP_FOWNER);
+
+other_userns:
+       return !ns_capable(inode_userns(inode), CAP_FOWNER);
 }
 
 /*
@@ -2099,12 +1935,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        return error;
 }
 
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;
 
+       /* O_PATH? */
+       if (!acc_mode)
+               return 0;
+
        if (!inode)
                return -ENOENT;
 
@@ -2141,7 +1981,7 @@ int may_open(struct path *path, int acc_mode, int flag)
        }
 
        /* O_NOATIME can only be set by the owner or superuser */
-       if (flag & O_NOATIME && !is_owner_or_cap(inode))
+       if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                return -EPERM;
 
        /*
@@ -2173,34 +2013,6 @@ static int handle_truncate(struct file *filp)
 }
 
 /*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-                               int open_flag, int mode)
-{
-       int error;
-       struct dentry *dir = nd->path.dentry;
-
-       if (!IS_POSIXACL(dir->d_inode))
-               mode &= ~current_umask();
-       error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-       if (error)
-               goto out_unlock;
-       error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-       mutex_unlock(&dir->d_inode->i_mutex);
-       dput(nd->path.dentry);
-       nd->path.dentry = path->dentry;
-
-       if (error)
-               return error;
-       /* Don't check for write permission, don't truncate */
-       return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
-}
-
-/*
  * Note that while the flag value (low two bits) for sys_open means:
  *     00 - read-only
  *     01 - write-only
@@ -2224,126 +2036,107 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
 
-static int open_will_truncate(int flag, struct inode *inode)
-{
-       /*
-        * We'll never write to the fs underlying
-        * a device file.
-        */
-       if (special_file(inode->i_mode))
-               return 0;
-       return (flag & O_TRUNC);
-}
-
-static struct file *finish_open(struct nameidata *nd,
-                               int open_flag, int acc_mode)
-{
-       struct file *filp;
-       int will_truncate;
-       int error;
-
-       will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
-       if (will_truncate) {
-               error = mnt_want_write(nd->path.mnt);
-               if (error)
-                       goto exit;
-       }
-       error = may_open(&nd->path, acc_mode, open_flag);
-       if (error) {
-               if (will_truncate)
-                       mnt_drop_write(nd->path.mnt);
-               goto exit;
-       }
-       filp = nameidata_to_filp(nd);
-       if (!IS_ERR(filp)) {
-               error = ima_file_check(filp, acc_mode);
-               if (error) {
-                       fput(filp);
-                       filp = ERR_PTR(error);
-               }
-       }
-       if (!IS_ERR(filp)) {
-               if (will_truncate) {
-                       error = handle_truncate(filp);
-                       if (error) {
-                               fput(filp);
-                               filp = ERR_PTR(error);
-                       }
-               }
-       }
-       /*
-        * It is now safe to drop the mnt write
-        * because the filp has had a write taken
-        * on its behalf.
-        */
-       if (will_truncate)
-               mnt_drop_write(nd->path.mnt);
-       path_put(&nd->path);
-       return filp;
-
-exit:
-       path_put(&nd->path);
-       return ERR_PTR(error);
-}
-
 /*
- * Handle O_CREAT case for do_filp_open
+ * Handle the last step of open()
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
-                           int open_flag, int acc_mode,
-                           int mode, const char *pathname)
+                           const struct open_flags *op, const char *pathname)
 {
        struct dentry *dir = nd->path.dentry;
+       struct dentry *dentry;
+       int open_flag = op->open_flag;
+       int will_truncate = open_flag & O_TRUNC;
+       int want_write = 0;
+       int acc_mode = op->acc_mode;
        struct file *filp;
-       int error = -EISDIR;
+       int error;
+
+       nd->flags &= ~LOOKUP_PARENT;
+       nd->flags |= op->intent;
 
        switch (nd->last_type) {
        case LAST_DOTDOT:
-               follow_dotdot(nd);
-               dir = nd->path.dentry;
        case LAST_DOT:
-               if (need_reval_dot(dir)) {
-                       int status = d_revalidate(nd->path.dentry, nd);
-                       if (!status)
-                               status = -ESTALE;
-                       if (status < 0) {
-                               error = status;
-                               goto exit;
-                       }
-               }
+               error = handle_dots(nd, nd->last_type);
+               if (error)
+                       return ERR_PTR(error);
                /* fallthrough */
        case LAST_ROOT:
-               goto exit;
+               error = complete_walk(nd);
+               if (error)
+                       return ERR_PTR(error);
+               audit_inode(pathname, nd->path.dentry);
+               if (open_flag & O_CREAT) {
+                       error = -EISDIR;
+                       goto exit;
+               }
+               goto ok;
        case LAST_BIND:
+               error = complete_walk(nd);
+               if (error)
+                       return ERR_PTR(error);
                audit_inode(pathname, dir);
                goto ok;
        }
 
+       if (!(open_flag & O_CREAT)) {
+               int symlink_ok = 0;
+               if (nd->last.name[nd->last.len])
+                       nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+               if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+                       symlink_ok = 1;
+               /* we _can_ be in RCU mode here */
+               error = walk_component(nd, path, &nd->last, LAST_NORM,
+                                       !symlink_ok);
+               if (error < 0)
+                       return ERR_PTR(error);
+               if (error) /* symlink */
+                       return NULL;
+               /* sayonara */
+               error = complete_walk(nd);
+               if (error)
+                       return ERR_PTR(-ECHILD);
+
+               error = -ENOTDIR;
+               if (nd->flags & LOOKUP_DIRECTORY) {
+                       if (!nd->inode->i_op->lookup)
+                               goto exit;
+               }
+               audit_inode(pathname, nd->path.dentry);
+               goto ok;
+       }
+
+       /* create side of things */
+       error = complete_walk(nd);
+       if (error)
+               return ERR_PTR(error);
+
+       audit_inode(pathname, dir);
+       error = -EISDIR;
        /* trailing slashes? */
        if (nd->last.name[nd->last.len])
                goto exit;
 
        mutex_lock(&dir->d_inode->i_mutex);
 
-       path->dentry = lookup_hash(nd);
-       path->mnt = nd->path.mnt;
-
-       error = PTR_ERR(path->dentry);
-       if (IS_ERR(path->dentry)) {
+       dentry = lookup_hash(nd);
+       error = PTR_ERR(dentry);
+       if (IS_ERR(dentry)) {
                mutex_unlock(&dir->d_inode->i_mutex);
                goto exit;
        }
 
-       if (IS_ERR(nd->intent.open.file)) {
-               error = PTR_ERR(nd->intent.open.file);
-               goto exit_mutex_unlock;
-       }
+       path->dentry = dentry;
+       path->mnt = nd->path.mnt;
 
        /* Negative dentry, just create the file */
-       if (!path->dentry->d_inode) {
+       if (!dentry->d_inode) {
+               int mode = op->mode;
+               if (!IS_POSIXACL(dir->d_inode))
+                       mode &= ~current_umask();
                /*
                 * This write is needed to ensure that a
-                * ro->rw transition does not occur between
+                * rw->ro transition does not occur between
                 * the time when the file is created and when
                 * a permanent write count is taken through
                 * the 'struct file' in nameidata_to_filp().
@@ -2351,22 +2144,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto exit_mutex_unlock;
-               error = __open_namei_create(nd, path, open_flag, mode);
-               if (error) {
-                       mnt_drop_write(nd->path.mnt);
-                       goto exit;
-               }
-               filp = nameidata_to_filp(nd);
-               mnt_drop_write(nd->path.mnt);
-               path_put(&nd->path);
-               if (!IS_ERR(filp)) {
-                       error = ima_file_check(filp, acc_mode);
-                       if (error) {
-                               fput(filp);
-                               filp = ERR_PTR(error);
-                       }
-               }
-               return filp;
+               want_write = 1;
+               /* Don't check for write permission, don't truncate */
+               open_flag &= ~O_TRUNC;
+               will_truncate = 0;
+               acc_mode = MAY_OPEN;
+               error = security_path_mknod(&nd->path, dentry, mode, 0);
+               if (error)
+                       goto exit_mutex_unlock;
+               error = vfs_create(dir->d_inode, dentry, mode, nd);
+               if (error)
+                       goto exit_mutex_unlock;
+               mutex_unlock(&dir->d_inode->i_mutex);
+               dput(nd->path.dentry);
+               nd->path.dentry = dentry;
+               goto common;
        }
 
        /*
@@ -2396,7 +2188,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (S_ISDIR(nd->inode->i_mode))
                goto exit;
 ok:
-       filp = finish_open(nd, open_flag, acc_mode);
+       if (!S_ISREG(nd->inode->i_mode))
+               will_truncate = 0;
+
+       if (will_truncate) {
+               error = mnt_want_write(nd->path.mnt);
+               if (error)
+                       goto exit;
+               want_write = 1;
+       }
+common:
+       error = may_open(&nd->path, acc_mode, open_flag);
+       if (error)
+               goto exit;
+       filp = nameidata_to_filp(nd);
+       if (!IS_ERR(filp)) {
+               error = ima_file_check(filp, op->acc_mode);
+               if (error) {
+                       fput(filp);
+                       filp = ERR_PTR(error);
+               }
+       }
+       if (!IS_ERR(filp)) {
+               if (will_truncate) {
+                       error = handle_truncate(filp);
+                       if (error) {
+                               fput(filp);
+                               filp = ERR_PTR(error);
+                       }
+               }
+       }
+out:
+       if (want_write)
+               mnt_drop_write(nd->path.mnt);
+       path_put(&nd->path);
        return filp;
 
 exit_mutex_unlock:
@@ -2404,197 +2229,103 @@ exit_mutex_unlock:
 exit_dput:
        path_put_conditional(path, nd);
 exit:
-       path_put(&nd->path);
-       return ERR_PTR(error);
+       filp = ERR_PTR(error);
+       goto out;
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
-               int open_flag, int mode, int acc_mode)
+static struct file *path_openat(int dfd, const char *pathname,
+               struct nameidata *nd, const struct open_flags *op, int flags)
 {
+       struct file *base = NULL;
        struct file *filp;
-       struct nameidata nd;
-       int error;
        struct path path;
-       int count = 0;
-       int flag = open_to_namei_flags(open_flag);
-       int flags;
-
-       if (!(open_flag & O_CREAT))
-               mode = 0;
-
-       /* Must never be set by userspace */
-       open_flag &= ~FMODE_NONOTIFY;
-
-       /*
-        * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-        * check for O_DSYNC if the need any syncing at all we enforce it's
-        * always set instead of having to deal with possibly weird behaviour
-        * for malicious applications setting only __O_SYNC.
-        */
-       if (open_flag & __O_SYNC)
-               open_flag |= O_DSYNC;
-
-       if (!acc_mode)
-               acc_mode = MAY_OPEN | ACC_MODE(open_flag);
-
-       /* O_TRUNC implies we need access checks for write permissions */
-       if (open_flag & O_TRUNC)
-               acc_mode |= MAY_WRITE;
-
-       /* Allow the LSM permission hook to distinguish append 
-          access from general write access. */
-       if (open_flag & O_APPEND)
-               acc_mode |= MAY_APPEND;
-
-       flags = LOOKUP_OPEN;
-       if (open_flag & O_CREAT) {
-               flags |= LOOKUP_CREATE;
-               if (open_flag & O_EXCL)
-                       flags |= LOOKUP_EXCL;
-       }
-       if (open_flag & O_DIRECTORY)
-               flags |= LOOKUP_DIRECTORY;
-       if (!(open_flag & O_NOFOLLOW))
-               flags |= LOOKUP_FOLLOW;
+       int error;
 
        filp = get_empty_filp();
        if (!filp)
                return ERR_PTR(-ENFILE);
 
-       filp->f_flags = open_flag;
-       nd.intent.open.file = filp;
-       nd.intent.open.flags = flag;
-       nd.intent.open.create_mode = mode;
+       filp->f_flags = op->open_flag;
+       nd->intent.open.file = filp;
+       nd->intent.open.flags = open_to_namei_flags(op->open_flag);
+       nd->intent.open.create_mode = op->mode;
 
-       if (open_flag & O_CREAT)
-               goto creat;
-
-       /* !O_CREAT, simple open */
-       error = do_path_lookup(dfd, pathname, flags, &nd);
+       error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
        if (unlikely(error))
                goto out_filp;
-       error = -ELOOP;
-       if (!(nd.flags & LOOKUP_FOLLOW)) {
-               if (nd.inode->i_op->follow_link)
-                       goto out_path;
-       }
-       error = -ENOTDIR;
-       if (nd.flags & LOOKUP_DIRECTORY) {
-               if (!nd.inode->i_op->lookup)
-                       goto out_path;
-       }
-       audit_inode(pathname, nd.path.dentry);
-       filp = finish_open(&nd, open_flag, acc_mode);
-       release_open_intent(&nd);
-       return filp;
-
-creat:
-       /* OK, have to create the file. Find the parent. */
-       error = path_init_rcu(dfd, pathname,
-                       LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-       if (error)
-               goto out_filp;
-       error = path_walk_rcu(pathname, &nd);
-       path_finish_rcu(&nd);
-       if (unlikely(error == -ECHILD || error == -ESTALE)) {
-               /* slower, locked walk */
-               if (error == -ESTALE) {
-reval:
-                       flags |= LOOKUP_REVAL;
-               }
-               error = path_init(dfd, pathname,
-                               LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-               if (error)
-                       goto out_filp;
 
-               error = path_walk_simple(pathname, &nd);
-       }
+       current->total_link_count = 0;
+       error = link_path_walk(pathname, nd);
        if (unlikely(error))
                goto out_filp;
-       if (unlikely(!audit_dummy_context()))
-               audit_inode(pathname, nd.path.dentry);
 
-       /*
-        * We have the parent and last component.
-        */
-       nd.flags = flags;
-       filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+       filp = do_last(nd, &path, op, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
                struct path link = path;
-               struct inode *linki = link.dentry->d_inode;
                void *cookie;
-               error = -ELOOP;
-               if (!(nd.flags & LOOKUP_FOLLOW))
-                       goto exit_dput;
-               if (count++ == 32)
-                       goto exit_dput;
-               /*
-                * This is subtle. Instead of calling do_follow_link() we do
-                * the thing by hands. The reason is that this way we have zero
-                * link_count and path_walk() (called from ->follow_link)
-                * honoring LOOKUP_PARENT.  After that we have the parent and
-                * last component, i.e. we are in the same situation as after
-                * the first path_walk().  Well, almost - if the last component
-                * is normal we get its copy stored in nd->last.name and we will
-                * have to putname() it when we are done. Procfs-like symlinks
-                * just set LAST_BIND.
-                */
-               nd.flags |= LOOKUP_PARENT;
-               error = security_inode_follow_link(link.dentry, &nd);
-               if (error)
-                       goto exit_dput;
-               error = __do_follow_link(&link, &nd, &cookie);
-               if (unlikely(error)) {
-                       if (!IS_ERR(cookie) && linki->i_op->put_link)
-                               linki->i_op->put_link(link.dentry, &nd, cookie);
-                       /* nd.path had been dropped */
-                       nd.path = link;
-                       goto out_path;
+               if (!(nd->flags & LOOKUP_FOLLOW)) {
+                       path_put_conditional(&path, nd);
+                       path_put(&nd->path);
+                       filp = ERR_PTR(-ELOOP);
+                       break;
                }
-               nd.flags &= ~LOOKUP_PARENT;
-               filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-               if (linki->i_op->put_link)
-                       linki->i_op->put_link(link.dentry, &nd, cookie);
-               path_put(&link);
+               nd->flags |= LOOKUP_PARENT;
+               nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+               error = follow_link(&link, nd, &cookie);
+               if (unlikely(error))
+                       filp = ERR_PTR(error);
+               else
+                       filp = do_last(nd, &path, op, pathname);
+               put_link(nd, &link, cookie);
        }
 out:
-       if (nd.root.mnt)
-               path_put(&nd.root);
-       if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-               goto reval;
-       release_open_intent(&nd);
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
+               path_put(&nd->root);
+       if (base)
+               fput(base);
+       release_open_intent(nd);
        return filp;
 
-exit_dput:
-       path_put_conditional(&path, &nd);
-out_path:
-       path_put(&nd.path);
 out_filp:
        filp = ERR_PTR(error);
        goto out;
 }
 
-/**
- * filp_open - open file and return file pointer
- *
- * @filename:  path to open
- * @flags:     open flags as per the open(2) second argument
- * @mode:      mode for the new file if O_CREAT is set, else ignored
- *
- * This is the helper to open a file from kernelspace if you really
- * have to.  But in generally you should not do this, so please move
- * along, nothing to see here..
- */
-struct file *filp_open(const char *filename, int flags, int mode)
+struct file *do_filp_open(int dfd, const char *pathname,
+               const struct open_flags *op, int flags)
 {
-       return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
+       struct nameidata nd;
+       struct file *filp;
+
+       filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+       if (unlikely(filp == ERR_PTR(-ECHILD)))
+               filp = path_openat(dfd, pathname, &nd, op, flags);
+       if (unlikely(filp == ERR_PTR(-ESTALE)))
+               filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+       return filp;
+}
+
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+               const char *name, const struct open_flags *op, int flags)
+{
+       struct nameidata nd;
+       struct file *file;
+
+       nd.root.mnt = mnt;
+       nd.root.dentry = dentry;
+
+       flags |= LOOKUP_ROOT;
+
+       if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+               return ERR_PTR(-ELOOP);
+
+       file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+       if (unlikely(file == ERR_PTR(-ECHILD)))
+               file = path_openat(-1, name, &nd, op, flags);
+       if (unlikely(file == ERR_PTR(-ESTALE)))
+               file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+       return file;
 }
-EXPORT_SYMBOL(filp_open);
 
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -2656,7 +2387,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        if (error)
                return error;
 
-       if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+       if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+           !ns_capable(inode_userns(dir), CAP_MKNOD))
                return -EPERM;
 
        if (!dir->i_op->mknod)
@@ -2817,10 +2549,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 }
 
 /*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
  *
  * A low-level filesystem can, if it choses, legally
  * do a
@@ -2833,10 +2565,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
  */
 void dentry_unhash(struct dentry *dentry)
 {
-       dget(dentry);
        shrink_dcache_parent(dentry);
        spin_lock(&dentry->d_lock);
-       if (dentry->d_count == 2)
+       if (dentry->d_count == 1)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
 }
@@ -2852,25 +2583,27 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -EPERM;
 
        mutex_lock(&dentry->d_inode->i_mutex);
-       dentry_unhash(dentry);
+
+       error = -EBUSY;
        if (d_mountpoint(dentry))
-               error = -EBUSY;
-       else {
-               error = security_inode_rmdir(dir, dentry);
-               if (!error) {
-                       error = dir->i_op->rmdir(dir, dentry);
-                       if (!error) {
-                               dentry->d_inode->i_flags |= S_DEAD;
-                               dont_mount(dentry);
-                       }
-               }
-       }
+               goto out;
+
+       error = security_inode_rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       shrink_dcache_parent(dentry);
+       error = dir->i_op->rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       dentry->d_inode->i_flags |= S_DEAD;
+       dont_mount(dentry);
+
+out:
        mutex_unlock(&dentry->d_inode->i_mutex);
-       if (!error) {
+       if (!error)
                d_delete(dentry);
-       }
-       dput(dentry);
-
        return error;
 }
 
@@ -2904,6 +2637,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
+       if (!dentry->d_inode) {
+               error = -ENOENT;
+               goto exit3;
+       }
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto exit3;
@@ -2992,8 +2729,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
-               if (inode)
-                       ihold(inode);
+               if (!inode)
+                       goto slashes;
+               ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
@@ -3133,7 +2871,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
                return error;
 
        mutex_lock(&inode->i_mutex);
-       error = dir->i_op->link(old_dentry, dir, new_dentry);
+       /* Make sure we don't allow creating hardlink to an unlinked file */
+       if (inode->i_nlink == 0)
+               error =  -ENOENT;
+       else
+               error = dir->i_op->link(old_dentry, dir, new_dentry);
        mutex_unlock(&inode->i_mutex);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
@@ -3155,15 +2897,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        struct dentry *new_dentry;
        struct nameidata nd;
        struct path old_path;
+       int how = 0;
        int error;
        char *to;
 
-       if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+       if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
+       /*
+        * To use null names we require CAP_DAC_READ_SEARCH
+        * This ensures that not everyone will be able to create
+        * handlink using the passed filedescriptor.
+        */
+       if (flags & AT_EMPTY_PATH) {
+               if (!capable(CAP_DAC_READ_SEARCH))
+                       return -ENOENT;
+               how = LOOKUP_EMPTY;
+       }
+
+       if (flags & AT_SYMLINK_FOLLOW)
+               how |= LOOKUP_FOLLOW;
 
-       error = user_path_at(olddfd, oldname,
-                            flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-                            &old_path);
+       error = user_path_at(olddfd, oldname, how, &old_path);
        if (error)
                return error;
 
@@ -3225,12 +2979,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *        HOWEVER, it relies on the assumption that any object with ->lookup()
  *        has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *        we'd better make sure that there's no link(2) for them.
- *     d) some filesystems don't support opened-but-unlinked directories,
- *        either because of layout or because they are not ready to deal with
- *        all cases correctly. The latter will be fixed (taking this sort of
- *        stuff into VFS), but the former is not going away. Solution: the same
- *        trick as in rmdir().
- *     e) conversion from fhandle to dentry may come in the wrong moment - when
+ *     d) conversion from fhandle to dentry may come in the wrong moment - when
  *        we are removing the target. Solution: we will have to grab ->i_mutex
  *        in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
  *        ->i_mutex on parents, which works but leads to some truly excessive
@@ -3240,7 +2989,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry)
 {
        int error = 0;
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
 
        /*
         * If we are going to change the parent - check write permissions,
@@ -3256,26 +3005,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
 
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
-       if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else {
-               if (target)
-                       dentry_unhash(new_dentry);
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       }
+
+       error = -EBUSY;
+       if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+               goto out;
+
+       if (target)
+               shrink_dcache_parent(new_dentry);
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
        if (target) {
-               if (!error) {
-                       target->i_flags |= S_DEAD;
-                       dont_mount(new_dentry);
-               }
-               mutex_unlock(&target->i_mutex);
-               if (d_unhashed(new_dentry))
-                       d_rehash(new_dentry);
-               dput(new_dentry);
+               target->i_flags |= S_DEAD;
+               dont_mount(new_dentry);
        }
+out:
+       if (target)
+               mutex_unlock(&target->i_mutex);
        if (!error)
                if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                        d_move(old_dentry,new_dentry);
@@ -3285,7 +3034,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                            struct inode *new_dir, struct dentry *new_dentry)
 {
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
        int error;
 
        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3293,19 +3042,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                return error;
 
        dget(new_dentry);
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
+
+       error = -EBUSY;
        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       if (!error) {
-               if (target)
-                       dont_mount(new_dentry);
-               if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-                       d_move(old_dentry, new_dentry);
-       }
+               goto out;
+
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
+       if (target)
+               dont_mount(new_dentry);
+       if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+               d_move(old_dentry, new_dentry);
+out:
        if (target)
                mutex_unlock(&target->i_mutex);
        dput(new_dentry);
@@ -3600,7 +3352,7 @@ EXPORT_SYMBOL(page_readlink);
 EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);