]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - fs/namei.c
exofs: Move exofs specific osd operations out of ios.c
[linux-2.6.git] / fs / namei.c
index def63e7c058dcc50cd4c72fa400536b71e72ebce..14ab8d3f2f0c8f7fc3e829ed26404e53a2420028 100644 (file)
@@ -70,7 +70,7 @@
  * name indicated by the symlink. The old code always complained that the
  * name already exists, due to not following the symlink even if its target
  * is nonexistent.  The new semantics affects also mknod() and link() when
- * the name is a symlink pointing to a non-existant name.
+ * the name is a symlink pointing to a non-existent name.
  *
  * I don't know which semantics is the right one, since I have no access
  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
        return retval;
 }
 
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
        char *tmp, *result;
 
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
 
                result = tmp;
                if (retval < 0) {
-                       __putname(tmp);
-                       result = ERR_PTR(retval);
+                       if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+                               __putname(tmp);
+                               result = ERR_PTR(retval);
+                       }
                }
        }
        audit_getname(result);
        return result;
 }
 
+char *getname(const char __user * filename)
+{
+       return getname_flags(filename, 0);
+}
+
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -172,10 +179,13 @@ EXPORT_SYMBOL(putname);
 static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
                int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
-       umode_t                 mode = inode->i_mode;
+       unsigned int mode = inode->i_mode;
 
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
+       if (current_user_ns() != inode_userns(inode))
+               goto other_perms;
+
        if (current_fsuid() == inode->i_uid)
                mode >>= 6;
        else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
                        mode >>= 3;
        }
 
+other_perms:
        /*
         * If the DACs are ok we don't need any capability check.
         */
@@ -227,10 +238,11 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
 
        /*
         * Read/write DACs are always overridable.
-        * Executable DACs are overridable if at least one exec bit is set.
+        * Executable DACs are overridable for all directories and
+        * for non-directories that have least one exec bit set.
         */
        if (!(mask & MAY_EXEC) || execute_ok(inode))
-               if (capable(CAP_DAC_OVERRIDE))
+               if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
                        return 0;
 
        /*
@@ -238,7 +250,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
         */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
-               if (capable(CAP_DAC_READ_SEARCH))
+               if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
                        return 0;
 
        return -EACCES;
@@ -380,103 +392,65 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
  */
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
-       struct fs_struct *fs = current->fs;
-       struct dentry *dentry = nd->path.dentry;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt) {
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt ||
-                               nd->root.dentry != fs->root.dentry)
-                       goto err_root;
-       }
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-       if (nd->root.mnt) {
-               path_get(&nd->root);
-               spin_unlock(&fs->lock);
-       }
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       nd->flags &= ~LOOKUP_RCU;
-       return 0;
-err:
-       spin_unlock(&dentry->d_lock);
-err_root:
-       if (nd->root.mnt)
-               spin_unlock(&fs->lock);
-       return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
-       if (nd->flags & LOOKUP_RCU)
-               return nameidata_drop_rcu(nd);
-       return 0;
-}
 
 /**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
  * Returns: 0 on success, -ECHILD on failure
  *
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd or NULL.  Must be called from rcu-walk context.
  */
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
+       int want_root = 0;
 
        BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt) {
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+               want_root = 1;
                spin_lock(&fs->lock);
                if (nd->root.mnt != fs->root.mnt ||
                                nd->root.dentry != fs->root.dentry)
                        goto err_root;
        }
        spin_lock(&parent->d_lock);
-       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err;
-       /*
-        * If the sequence check on the child dentry passed, then the child has
-        * not been removed from its parent. This means the parent dentry must
-        * be valid and able to take a reference at this point.
-        */
-       BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-       BUG_ON(!parent->d_count);
-       parent->d_count++;
-       spin_unlock(&dentry->d_lock);
+       if (!dentry) {
+               if (!__d_rcu_to_refcount(parent, nd->seq))
+                       goto err_parent;
+               BUG_ON(nd->inode != parent->d_inode);
+       } else {
+               if (dentry->d_parent != parent)
+                       goto err_parent;
+               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+               if (!__d_rcu_to_refcount(dentry, nd->seq))
+                       goto err_child;
+               /*
+                * If the sequence check on the child dentry passed, then
+                * the child has not been removed from its parent. This
+                * means the parent dentry must be valid and able to take
+                * a reference at this point.
+                */
+               BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+               BUG_ON(!parent->d_count);
+               parent->d_count++;
+               spin_unlock(&dentry->d_lock);
+       }
        spin_unlock(&parent->d_lock);
-       if (nd->root.mnt) {
+       if (want_root) {
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }
@@ -486,66 +460,17 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
        br_read_unlock(vfsmount_lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
-err:
+
+err_child:
        spin_unlock(&dentry->d_lock);
+err_parent:
        spin_unlock(&parent->d_lock);
 err_root:
-       if (nd->root.mnt)
+       if (want_root)
                spin_unlock(&fs->lock);
        return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
-       if (nd->flags & LOOKUP_RCU) {
-               if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
-                       nd->flags &= ~LOOKUP_RCU;
-                       nd->root.mnt = NULL;
-                       rcu_read_unlock();
-                       br_read_unlock(vfsmount_lock);
-                       return -ECHILD;
-               }
-       }
-       return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
-       struct dentry *dentry = nd->path.dentry;
-
-       BUG_ON(!(nd->flags & LOOKUP_RCU));
-       nd->flags &= ~LOOKUP_RCU;
-       nd->root.mnt = NULL;
-       spin_lock(&dentry->d_lock);
-       if (!__d_rcu_to_refcount(dentry, nd->seq))
-               goto err_unlock;
-       BUG_ON(nd->inode != dentry->d_inode);
-       spin_unlock(&dentry->d_lock);
-
-       mntget(nd->path.mnt);
-
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-
-       return 0;
-
-err_unlock:
-       spin_unlock(&dentry->d_lock);
-       rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
-       return -ECHILD;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -589,49 +514,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
        return dentry;
 }
 
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
-{
-       int status = d_revalidate(dentry, nd);
-       if (likely(status > 0))
-               return dentry;
-       if (status == -ECHILD) {
-               if (nameidata_dentry_drop_rcu(nd, dentry))
-                       return ERR_PTR(-ECHILD);
-               return do_revalidate(dentry, nd);
-       }
-       if (status < 0)
-               return ERR_PTR(status);
-       /* Don't d_invalidate in rcu-walk mode */
-       if (nameidata_dentry_drop_rcu(nd, dentry))
-               return ERR_PTR(-ECHILD);
-       if (!d_invalidate(dentry)) {
-               dput(dentry);
-               dentry = NULL;
-       }
-       return dentry;
-}
-
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd:  pointer nameidata
  *
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it.  Return 0 on
+ * success, -error on failure.  In case of failure caller does not
+ * need to drop nd->path.
  */
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
 {
        struct dentry *dentry = nd->path.dentry;
        int status;
 
+       if (nd->flags & LOOKUP_RCU) {
+               nd->flags &= ~LOOKUP_RCU;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
+               spin_lock(&dentry->d_lock);
+               if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+                       spin_unlock(&dentry->d_lock);
+                       rcu_read_unlock();
+                       br_read_unlock(vfsmount_lock);
+                       return -ECHILD;
+               }
+               BUG_ON(nd->inode != dentry->d_inode);
+               spin_unlock(&dentry->d_lock);
+               mntget(nd->path.mnt);
+               rcu_read_unlock();
+               br_read_unlock(vfsmount_lock);
+       }
+
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
                return 0;
 
@@ -649,6 +564,7 @@ static inline int handle_reval_path(struct nameidata *nd)
        if (!status)
                status = -ESTALE;
 
+       path_put(&nd->path);
        return status;
 }
 
@@ -664,6 +580,7 @@ static inline int handle_reval_path(struct nameidata *nd)
 static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
        int ret;
+       struct user_namespace *ns = inode_userns(inode);
 
        if (inode->i_op->permission) {
                ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -676,7 +593,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
        if (ret == -ECHILD)
                return ret;
 
-       if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
+       if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
+                       ns_capable(ns, CAP_DAC_READ_SEARCH))
                goto ok;
 
        return ret;
@@ -701,6 +619,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
+                       nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        }
 }
@@ -747,20 +666,36 @@ static inline void path_to_nameidata(const struct path *path,
        nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+       struct inode *inode = link->dentry->d_inode;
+       if (!IS_ERR(cookie) && inode->i_op->put_link)
+               inode->i_op->put_link(link->dentry, nd, cookie);
+       path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
        int error;
        struct dentry *dentry = link->dentry;
 
        BUG_ON(nd->flags & LOOKUP_RCU);
 
-       touch_atime(link->mnt, dentry);
-       nd_set_link(nd, NULL);
-
        if (link->mnt == nd->path.mnt)
                mntget(link->mnt);
 
+       if (unlikely(current->total_link_count >= 40)) {
+               *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       cond_resched();
+       current->total_link_count++;
+
+       touch_atime(link->mnt, dentry);
+       nd_set_link(nd, NULL);
+
        error = security_inode_follow_link(link->dentry, nd);
        if (error) {
                *p = ERR_PTR(error); /* no ->put_link(), please */
@@ -776,51 +711,19 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
                error = 0;
                if (s)
                        error = __vfs_follow_link(nd, s);
-               else if (nd->last_type == LAST_BIND)
+               else if (nd->last_type == LAST_BIND) {
                        nd->flags |= LOOKUP_JUMPED;
+                       nd->inode = nd->path.dentry->d_inode;
+                       if (nd->inode->i_op->follow_link) {
+                               /* stepped on a _really_ weird one */
+                               path_put(&nd->path);
+                               error = -ELOOP;
+                       }
+               }
        }
        return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
-{
-       void *cookie;
-       int err = -ELOOP;
-
-       /* We drop rcu-walk here */
-       if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-               return -ECHILD;
-       BUG_ON(inode != path->dentry->d_inode);
-
-       if (current->link_count >= MAX_NESTED_LINKS)
-               goto loop;
-       if (current->total_link_count >= 40)
-               goto loop;
-       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-       cond_resched();
-       current->link_count++;
-       current->total_link_count++;
-       nd->depth++;
-       err = __do_follow_link(path, nd, &cookie);
-       if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-               path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-       path_put(path);
-       current->link_count--;
-       nd->depth--;
-       return err;
-loop:
-       path_put_conditional(path, nd);
-       path_put(&nd->path);
-       return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
        struct vfsmount *parent;
@@ -912,6 +815,11 @@ static int follow_automount(struct path *path, unsigned flags,
        if (!mnt) /* mount collision */
                return 0;
 
+       if (!*need_mntput) {
+               /* lock_mount() may release path->mnt on error */
+               mntget(path->mnt);
+               *need_mntput = true;
+       }
        err = finish_automount(mnt, path);
 
        switch (err) {
@@ -919,12 +827,9 @@ static int follow_automount(struct path *path, unsigned flags,
                /* Someone else made a mount here whilst we were busy */
                return 0;
        case 0:
-               dput(path->dentry);
-               if (*need_mntput)
-                       mntput(path->mnt);
+               path_put(path);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
-               *need_mntput = true;
                return 0;
        default:
                return err;
@@ -944,9 +849,10 @@ static int follow_automount(struct path *path, unsigned flags,
  */
 static int follow_managed(struct path *path, unsigned flags)
 {
+       struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
        unsigned managed;
        bool need_mntput = false;
-       int ret;
+       int ret = 0;
 
        /* Given that we're not holding a lock here, we retain the value in a
         * local variable for each dentry as we look at it so that we don't see
@@ -959,10 +865,9 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_MANAGE_TRANSIT) {
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
-                       ret = path->dentry->d_op->d_manage(path->dentry,
-                                                          false, false);
+                       ret = path->dentry->d_op->d_manage(path->dentry, false);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                }
 
                /* Transit to a mounted filesystem. */
@@ -988,14 +893,19 @@ static int follow_managed(struct path *path, unsigned flags)
                if (managed & DCACHE_NEED_AUTOMOUNT) {
                        ret = follow_automount(path, flags, &need_mntput);
                        if (ret < 0)
-                               return ret == -EISDIR ? 0 : ret;
+                               break;
                        continue;
                }
 
                /* We didn't change the current path point */
                break;
        }
-       return 0;
+
+       if (need_mntput && path->mnt == mnt)
+               mntput(path->mnt);
+       if (ret == -EISDIR)
+               ret = 0;
+       return ret;
 }
 
 int follow_down_one(struct path *path)
@@ -1013,38 +923,62 @@ int follow_down_one(struct path *path)
        return 0;
 }
 
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+       return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+               dentry->d_op->d_manage(dentry, true) < 0);
+}
+
 /*
- * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..".  True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+ * we meet a managed dentry that would need blocking.
  */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-                              struct inode **inode, bool reverse_transit)
+                              struct inode **inode)
 {
-       while (d_mountpoint(path->dentry)) {
+       for (;;) {
                struct vfsmount *mounted;
-               if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-                   !reverse_transit &&
-                   path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+               /*
+                * Don't forget we might have a non-mountpoint managed dentry
+                * that wants to block transit.
+                */
+               if (unlikely(managed_dentry_might_block(path->dentry)))
                        return false;
+
+               if (!d_mountpoint(path->dentry))
+                       break;
+
                mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                if (!mounted)
                        break;
                path->mnt = mounted;
                path->dentry = mounted->mnt_root;
                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+               /*
+                * Update the inode too. We don't need to re-check the
+                * dentry sequence number here after this d_inode read,
+                * because a mount-point is always pinned.
+                */
                *inode = path->dentry->d_inode;
        }
-
-       if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-               return reverse_transit;
        return true;
 }
 
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
 {
-       struct inode *inode = nd->inode;
+       while (d_mountpoint(nd->path.dentry)) {
+               struct vfsmount *mounted;
+               mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+               if (!mounted)
+                       break;
+               nd->path.mnt = mounted;
+               nd->path.dentry = mounted->mnt_root;
+               nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+       }
+}
 
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
        set_root_rcu(nd);
 
        while (1) {
@@ -1060,7 +994,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
                                goto failed;
-                       inode = parent->d_inode;
                        nd->path.dentry = parent;
                        nd->seq = seq;
                        break;
@@ -1068,15 +1001,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                if (!follow_up_rcu(&nd->path))
                        break;
                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-               inode = nd->path.dentry->d_inode;
        }
-       __follow_mount_rcu(nd, &nd->path, &inode, true);
-       nd->inode = inode;
+       follow_mount_rcu(nd);
+       nd->inode = nd->path.dentry->d_inode;
        return 0;
 
 failed:
        nd->flags &= ~LOOKUP_RCU;
-       nd->root.mnt = NULL;
+       if (!(nd->flags & LOOKUP_ROOT))
+               nd->root.mnt = NULL;
        rcu_read_unlock();
        br_read_unlock(vfsmount_lock);
        return -ECHILD;
@@ -1086,11 +1019,8 @@ failed:
  * Follow down to the covering mount currently visible to userspace.  At each
  * point, the filesystem owning that dentry may be queried as to whether the
  * caller is permitted to proceed or not.
- *
- * Care must be taken as namespace_sem may be held (indicated by mounting_here
- * being true).
  */
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
 {
        unsigned managed;
        int ret;
@@ -1111,7 +1041,7 @@ int follow_down(struct path *path, bool mounting_here)
                        BUG_ON(!path->dentry->d_op);
                        BUG_ON(!path->dentry->d_op->d_manage);
                        ret = path->dentry->d_op->d_manage(
-                               path->dentry, mounting_here, false);
+                               path->dentry, false);
                        if (ret < 0)
                                return ret == -EISDIR ? 0 : ret;
                }
@@ -1213,7 +1143,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
-       struct inode *dir;
+       int need_reval = 1;
+       int status = 1;
        int err;
 
        /*
@@ -1223,48 +1154,72 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
-
                *inode = nd->inode;
                dentry = __d_lookup_rcu(parent, name, &seq, inode);
-               if (!dentry) {
-                       if (nameidata_drop_rcu(nd))
-                               return -ECHILD;
-                       goto need_lookup;
-               }
+               if (!dentry)
+                       goto unlazy;
+
                /* Memory barrier in read_seqcount_begin of child is enough */
                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                        return -ECHILD;
-
                nd->seq = seq;
+
                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-                       dentry = do_revalidate_rcu(dentry, nd);
-                       if (!dentry)
-                               goto need_lookup;
-                       if (IS_ERR(dentry))
-                               goto fail;
-                       if (!(nd->flags & LOOKUP_RCU))
-                               goto done;
+                       status = d_revalidate(dentry, nd);
+                       if (unlikely(status <= 0)) {
+                               if (status != -ECHILD)
+                                       need_reval = 0;
+                               goto unlazy;
+                       }
                }
                path->mnt = mnt;
                path->dentry = dentry;
-               if (likely(__follow_mount_rcu(nd, path, inode, false)))
-                       return 0;
-               if (nameidata_drop_rcu(nd))
+               if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+                       goto unlazy;
+               if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                       goto unlazy;
+               return 0;
+unlazy:
+               if (unlazy_walk(nd, dentry))
                        return -ECHILD;
-               /* fallthru */
+       } else {
+               dentry = __d_lookup(parent, name);
        }
-       dentry = __d_lookup(parent, name);
-       if (!dentry)
-               goto need_lookup;
-found:
-       if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-               dentry = do_revalidate(dentry, nd);
-               if (!dentry)
-                       goto need_lookup;
-               if (IS_ERR(dentry))
-                       goto fail;
+
+retry:
+       if (unlikely(!dentry)) {
+               struct inode *dir = parent->d_inode;
+               BUG_ON(nd->inode != dir);
+
+               mutex_lock(&dir->i_mutex);
+               dentry = d_lookup(parent, name);
+               if (likely(!dentry)) {
+                       dentry = d_alloc_and_lookup(parent, name, nd);
+                       if (IS_ERR(dentry)) {
+                               mutex_unlock(&dir->i_mutex);
+                               return PTR_ERR(dentry);
+                       }
+                       /* known good */
+                       need_reval = 0;
+                       status = 1;
+               }
+               mutex_unlock(&dir->i_mutex);
        }
-done:
+       if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+               status = d_revalidate(dentry, nd);
+       if (unlikely(status <= 0)) {
+               if (status < 0) {
+                       dput(dentry);
+                       return status;
+               }
+               if (!d_invalidate(dentry)) {
+                       dput(dentry);
+                       dentry = NULL;
+                       need_reval = 1;
+                       goto retry;
+               }
+       }
+
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
@@ -1274,39 +1229,6 @@ done:
        }
        *inode = path->dentry->d_inode;
        return 0;
-
-need_lookup:
-       dir = parent->d_inode;
-       BUG_ON(nd->inode != dir);
-
-       mutex_lock(&dir->i_mutex);
-       /*
-        * First re-do the cached lookup just in case it was created
-        * while we waited for the directory semaphore, or the first
-        * lookup failed due to an unrelated rename.
-        *
-        * This could use version numbering or similar to avoid unnecessary
-        * cache lookups, but then we'd have to do the first lookup in the
-        * non-racy way. However in the common case here, everything should
-        * be hot in cache, so would it be a big win?
-        */
-       dentry = d_lookup(parent, name);
-       if (likely(!dentry)) {
-               dentry = d_alloc_and_lookup(parent, name, nd);
-               mutex_unlock(&dir->i_mutex);
-               if (IS_ERR(dentry))
-                       goto fail;
-               goto done;
-       }
-       /*
-        * Uhhuh! Nasty case: the cache was re-populated while
-        * we waited on the semaphore. Need to revalidate.
-        */
-       mutex_unlock(&dir->i_mutex);
-       goto found;
-
-fail:
-       return PTR_ERR(dentry);
 }
 
 static inline int may_lookup(struct nameidata *nd)
@@ -1315,7 +1237,7 @@ static inline int may_lookup(struct nameidata *nd)
                int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
                if (err != -ECHILD)
                        return err;
-               if (nameidata_drop_rcu(nd))
+               if (unlazy_walk(nd, NULL))
                        return -ECHILD;
        }
        return exec_permission(nd->inode, 0);
@@ -1339,12 +1261,87 @@ static void terminate_walk(struct nameidata *nd)
                path_put(&nd->path);
        } else {
                nd->flags &= ~LOOKUP_RCU;
-               nd->root.mnt = NULL;
+               if (!(nd->flags & LOOKUP_ROOT))
+                       nd->root.mnt = NULL;
                rcu_read_unlock();
                br_read_unlock(vfsmount_lock);
        }
 }
 
+static inline int walk_component(struct nameidata *nd, struct path *path,
+               struct qstr *name, int type, int follow)
+{
+       struct inode *inode;
+       int err;
+       /*
+        * "." and ".." are special - ".." especially so because it has
+        * to be able to know about the current root directory and
+        * parent relationships.
+        */
+       if (unlikely(type != LAST_NORM))
+               return handle_dots(nd, type);
+       err = do_lookup(nd, name, path, &inode);
+       if (unlikely(err)) {
+               terminate_walk(nd);
+               return err;
+       }
+       if (!inode) {
+               path_to_nameidata(path, nd);
+               terminate_walk(nd);
+               return -ENOENT;
+       }
+       if (unlikely(inode->i_op->follow_link) && follow) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, path->dentry))) {
+                               terminate_walk(nd);
+                               return -ECHILD;
+                       }
+               }
+               BUG_ON(inode != path->dentry->d_inode);
+               return 1;
+       }
+       path_to_nameidata(path, nd);
+       nd->inode = inode;
+       return 0;
+}
+
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+       int res;
+
+       if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+               path_put_conditional(path, nd);
+               path_put(&nd->path);
+               return -ELOOP;
+       }
+       BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+
+       nd->depth++;
+       current->link_count++;
+
+       do {
+               struct path link = *path;
+               void *cookie;
+
+               res = follow_link(&link, nd, &cookie);
+               if (!res)
+                       res = walk_component(nd, path, &nd->last,
+                                            nd->last_type, LOOKUP_FOLLOW);
+               put_link(nd, &link, cookie);
+       } while (res > 0);
+
+       current->link_count--;
+       nd->depth--;
+       return res;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1364,12 +1361,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        if (!*name)
                return 0;
 
-       if (nd->depth)
-               lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
        /* At this point we know we have a real path component. */
        for(;;) {
-               struct inode *inode;
                unsigned long hash;
                struct qstr this;
                unsigned int c;
@@ -1420,74 +1413,26 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                        goto last_component;
                while (*++name == '/');
                if (!*name)
-                       goto last_with_slashes;
-
-               /*
-                * "." and ".." are special - ".." especially so because it has
-                * to be able to know about the current root directory and
-                * parent relationships.
-                */
-               if (unlikely(type != LAST_NORM)) {
-                       if (handle_dots(nd, type))
-                               return -ECHILD;
-                       continue;
-               }
+                       goto last_component;
 
-               /* This does the actual lookups.. */
-               err = do_lookup(nd, &this, &next, &inode);
-               if (err)
-                       break;
+               err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+               if (err < 0)
+                       return err;
 
-               if (inode && inode->i_op->follow_link) {
-                       err = do_follow_link(inode, &next, nd);
+               if (err) {
+                       err = nested_symlink(&next, nd);
                        if (err)
                                return err;
-                       nd->inode = nd->path.dentry->d_inode;
-               } else {
-                       path_to_nameidata(&next, nd);
-                       nd->inode = inode;
                }
-               err = -ENOENT;
-               if (!nd->inode)
-                       break;
                err = -ENOTDIR; 
                if (!nd->inode->i_op->lookup)
                        break;
                continue;
                /* here ends the main loop */
 
-last_with_slashes:
-               lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
                /* Clear LOOKUP_CONTINUE iff it was previously unset */
                nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-               if (lookup_flags & LOOKUP_PARENT)
-                       goto lookup_parent;
-               if (unlikely(type != LAST_NORM))
-                       return handle_dots(nd, type);
-               err = do_lookup(nd, &this, &next, &inode);
-               if (err)
-                       break;
-               if (inode && unlikely(inode->i_op->follow_link) &&
-                   (lookup_flags & LOOKUP_FOLLOW)) {
-                       err = do_follow_link(inode, &next, nd);
-                       if (err)
-                               return err;
-                       nd->inode = nd->path.dentry->d_inode;
-               } else {
-                       path_to_nameidata(&next, nd);
-                       nd->inode = inode;
-               }
-               err = -ENOENT;
-               if (!nd->inode)
-                       break;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
-                       if (!nd->inode->i_op->lookup)
-                               break;
-               }
-               return 0;
-lookup_parent:
                nd->last = this;
                nd->last_type = type;
                return 0;
@@ -1506,6 +1451,27 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
        nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;
+       if (flags & LOOKUP_ROOT) {
+               struct inode *inode = nd->root.dentry->d_inode;
+               if (*name) {
+                       if (!inode->i_op->lookup)
+                               return -ENOTDIR;
+                       retval = inode_permission(inode, MAY_EXEC);
+                       if (retval)
+                               return retval;
+               }
+               nd->path = nd->root;
+               nd->inode = inode;
+               if (flags & LOOKUP_RCU) {
+                       br_read_lock(vfsmount_lock);
+                       rcu_read_lock();
+                       nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+               } else {
+                       path_get(&nd->path);
+               }
+               return 0;
+       }
+
        nd->root.mnt = NULL;
 
        if (*name=='/') {
@@ -1537,20 +1503,22 @@ static int path_init(int dfd, const char *name, unsigned int flags,
        } else {
                struct dentry *dentry;
 
-               file = fget_light(dfd, &fput_needed);
+               file = fget_raw_light(dfd, &fput_needed);
                retval = -EBADF;
                if (!file)
                        goto out_fail;
 
                dentry = file->f_path.dentry;
 
-               retval = -ENOTDIR;
-               if (!S_ISDIR(dentry->d_inode->i_mode))
-                       goto fput_fail;
+               if (*name) {
+                       retval = -ENOTDIR;
+                       if (!S_ISDIR(dentry->d_inode->i_mode))
+                               goto fput_fail;
 
-               retval = file_permission(file, MAY_EXEC);
-               if (retval)
-                       goto fput_fail;
+                       retval = file_permission(file, MAY_EXEC);
+                       if (retval)
+                               goto fput_fail;
+               }
 
                nd->path = file->f_path;
                if (flags & LOOKUP_RCU) {
@@ -1574,12 +1542,23 @@ out_fail:
        return retval;
 }
 
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+       if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+               nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+       nd->flags &= ~LOOKUP_PARENT;
+       return walk_component(nd, path, &nd->last, nd->last_type,
+                                       nd->flags & LOOKUP_FOLLOW);
+}
+
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_lookupat(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
 {
        struct file *base = NULL;
-       int retval;
+       struct path path;
+       int err;
 
        /*
         * Path walking is largely split up into 2 different synchronisation
@@ -1595,32 +1574,45 @@ static int path_lookupat(int dfd, const char *name,
         * be handled by restarting a traditional ref-walk (which will always
         * be able to complete).
         */
-       retval = path_init(dfd, name, flags, nd, &base);
+       err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
 
-       if (unlikely(retval))
-               return retval;
+       if (unlikely(err))
+               return err;
 
        current->total_link_count = 0;
-       retval = link_path_walk(name, nd);
-
-       if (nd->flags & LOOKUP_RCU) {
-               /* went all way through without dropping RCU */
-               BUG_ON(retval);
-               if (nameidata_drop_rcu_last(nd))
-                       retval = -ECHILD;
+       err = link_path_walk(name, nd);
+
+       if (!err && !(flags & LOOKUP_PARENT)) {
+               err = lookup_last(nd, &path);
+               while (err > 0) {
+                       void *cookie;
+                       struct path link = path;
+                       nd->flags |= LOOKUP_PARENT;
+                       err = follow_link(&link, nd, &cookie);
+                       if (!err)
+                               err = lookup_last(nd, &path);
+                       put_link(nd, &link, cookie);
+               }
        }
 
-       if (!retval)
-               retval = handle_reval_path(nd);
+       if (!err)
+               err = complete_walk(nd);
+
+       if (!err && nd->flags & LOOKUP_DIRECTORY) {
+               if (!nd->inode->i_op->lookup) {
+                       path_put(&nd->path);
+                       err = -ENOTDIR;
+               }
+       }
 
        if (base)
                fput(base);
 
-       if (nd->root.mnt) {
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
                path_put(&nd->root);
                nd->root.mnt = NULL;
        }
-       return retval;
+       return err;
 }
 
 static int do_path_lookup(int dfd, const char *name,
@@ -1667,46 +1659,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct nameidata *nd)
 {
-       int result;
-
-       /* same as do_path_lookup */
-       nd->last_type = LAST_ROOT;
-       nd->flags = flags | LOOKUP_JUMPED;
-       nd->depth = 0;
-
-       nd->path.dentry = dentry;
-       nd->path.mnt = mnt;
-       path_get(&nd->path);
-       nd->root = nd->path;
-       path_get(&nd->root);
-       nd->inode = nd->path.dentry->d_inode;
-
-       current->total_link_count = 0;
-
-       result = link_path_walk(name, nd);
-       if (!result)
-               result = handle_reval_path(nd);
-       if (result == -ESTALE) {
-               /* nd->path had been dropped */
-               current->total_link_count = 0;
-               nd->path.dentry = dentry;
-               nd->path.mnt = mnt;
-               nd->inode = dentry->d_inode;
-               path_get(&nd->path);
-               nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
-
-               result = link_path_walk(name, nd);
-               if (!result)
-                       result = handle_reval_path(nd);
-       }
-       if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
-                               nd->inode))
-               audit_inode(name, nd->path.dentry);
-
-       path_put(&nd->root);
-       nd->root.mnt = NULL;
-
-       return result;
+       nd->root.dentry = dentry;
+       nd->root.mnt = mnt;
+       /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
+       return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -1795,7 +1751,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
 {
        struct nameidata nd;
-       char *tmp = getname(name);
+       char *tmp = getname_flags(name, flags);
        int err = PTR_ERR(tmp);
        if (!IS_ERR(tmp)) {
 
@@ -1837,11 +1793,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
 
        if (!(dir->i_mode & S_ISVTX))
                return 0;
+       if (current_user_ns() != inode_userns(inode))
+               goto other_userns;
        if (inode->i_uid == fsuid)
                return 0;
        if (dir->i_uid == fsuid)
                return 0;
-       return !capable(CAP_FOWNER);
+
+other_userns:
+       return !ns_capable(inode_userns(inode), CAP_FOWNER);
 }
 
 /*
@@ -1975,12 +1935,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        return error;
 }
 
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;
 
+       /* O_PATH? */
+       if (!acc_mode)
+               return 0;
+
        if (!inode)
                return -ENOENT;
 
@@ -2017,7 +1981,7 @@ int may_open(struct path *path, int acc_mode, int flag)
        }
 
        /* O_NOATIME can only be set by the owner or superuser */
-       if (flag & O_NOATIME && !is_owner_or_cap(inode))
+       if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                return -EPERM;
 
        /*
@@ -2048,30 +2012,6 @@ static int handle_truncate(struct file *filp)
        return error;
 }
 
-/*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-                               int open_flag, int mode)
-{
-       int error;
-       struct dentry *dir = nd->path.dentry;
-
-       if (!IS_POSIXACL(dir->d_inode))
-               mode &= ~current_umask();
-       error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-       if (error)
-               goto out_unlock;
-       error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-       mutex_unlock(&dir->d_inode->i_mutex);
-       dput(nd->path.dentry);
-       nd->path.dentry = path->dentry;
-       return error;
-}
-
 /*
  * Note that while the flag value (low two bits) for sys_open means:
  *     00 - read-only
@@ -2096,17 +2036,6 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
 
-static int open_will_truncate(int flag, struct inode *inode)
-{
-       /*
-        * We'll never write to the fs underlying
-        * a device file.
-        */
-       if (special_file(inode->i_mode))
-               return 0;
-       return (flag & O_TRUNC);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2114,9 +2043,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                            const struct open_flags *op, const char *pathname)
 {
        struct dentry *dir = nd->path.dentry;
-       int will_truncate;
+       struct dentry *dentry;
+       int open_flag = op->open_flag;
+       int will_truncate = open_flag & O_TRUNC;
+       int want_write = 0;
+       int acc_mode = op->acc_mode;
        struct file *filp;
-       struct inode *inode;
        int error;
 
        nd->flags &= ~LOOKUP_PARENT;
@@ -2130,59 +2062,44 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                        return ERR_PTR(error);
                /* fallthrough */
        case LAST_ROOT:
-               if (nd->flags & LOOKUP_RCU) {
-                       if (nameidata_drop_rcu_last(nd))
-                               return ERR_PTR(-ECHILD);
-               }
-               error = handle_reval_path(nd);
+               error = complete_walk(nd);
                if (error)
-                       goto exit;
+                       return ERR_PTR(error);
                audit_inode(pathname, nd->path.dentry);
-               if (op->open_flag & O_CREAT) {
+               if (open_flag & O_CREAT) {
                        error = -EISDIR;
                        goto exit;
                }
                goto ok;
        case LAST_BIND:
-               /* can't be RCU mode here */
-               error = handle_reval_path(nd);
+               error = complete_walk(nd);
                if (error)
-                       goto exit;
+                       return ERR_PTR(error);
                audit_inode(pathname, dir);
                goto ok;
        }
 
-       if (!(op->open_flag & O_CREAT)) {
+       if (!(open_flag & O_CREAT)) {
+               int symlink_ok = 0;
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+               if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+                       symlink_ok = 1;
                /* we _can_ be in RCU mode here */
-               error = do_lookup(nd, &nd->last, path, &inode);
-               if (error) {
-                       terminate_walk(nd);
+               error = walk_component(nd, path, &nd->last, LAST_NORM,
+                                       !symlink_ok);
+               if (error < 0)
                        return ERR_PTR(error);
-               }
-               if (!inode) {
-                       path_to_nameidata(path, nd);
-                       terminate_walk(nd);
-                       return ERR_PTR(-ENOENT);
-               }
-               if (unlikely(inode->i_op->follow_link)) {
-                       /* We drop rcu-walk here */
-                       if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-                               return ERR_PTR(-ECHILD);
+               if (error) /* symlink */
                        return NULL;
-               }
-               path_to_nameidata(path, nd);
-               nd->inode = inode;
                /* sayonara */
-               if (nd->flags & LOOKUP_RCU) {
-                       if (nameidata_drop_rcu_last(nd))
-                               return ERR_PTR(-ECHILD);
-               }
+               error = complete_walk(nd);
+               if (error)
+                       return ERR_PTR(-ECHILD);
 
                error = -ENOTDIR;
                if (nd->flags & LOOKUP_DIRECTORY) {
-                       if (!inode->i_op->lookup)
+                       if (!nd->inode->i_op->lookup)
                                goto exit;
                }
                audit_inode(pathname, nd->path.dentry);
@@ -2190,11 +2107,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        }
 
        /* create side of things */
-
-       if (nd->flags & LOOKUP_RCU) {
-               if (nameidata_drop_rcu_last(nd))
-                       return ERR_PTR(-ECHILD);
-       }
+       error = complete_walk(nd);
+       if (error)
+               return ERR_PTR(error);
 
        audit_inode(pathname, dir);
        error = -EISDIR;
@@ -2204,25 +2119,24 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
        mutex_lock(&dir->d_inode->i_mutex);
 
-       path->dentry = lookup_hash(nd);
-       path->mnt = nd->path.mnt;
-
-       error = PTR_ERR(path->dentry);
-       if (IS_ERR(path->dentry)) {
+       dentry = lookup_hash(nd);
+       error = PTR_ERR(dentry);
+       if (IS_ERR(dentry)) {
                mutex_unlock(&dir->d_inode->i_mutex);
                goto exit;
        }
 
-       if (IS_ERR(nd->intent.open.file)) {
-               error = PTR_ERR(nd->intent.open.file);
-               goto exit_mutex_unlock;
-       }
+       path->dentry = dentry;
+       path->mnt = nd->path.mnt;
 
        /* Negative dentry, just create the file */
-       if (!path->dentry->d_inode) {
+       if (!dentry->d_inode) {
+               int mode = op->mode;
+               if (!IS_POSIXACL(dir->d_inode))
+                       mode &= ~current_umask();
                /*
                 * This write is needed to ensure that a
-                * ro->rw transition does not occur between
+                * rw->ro transition does not occur between
                 * the time when the file is created and when
                 * a permanent write count is taken through
                 * the 'struct file' in nameidata_to_filp().
@@ -2230,28 +2144,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto exit_mutex_unlock;
-               error = __open_namei_create(nd, path, op->open_flag, op->mode);
-               if (error) {
-                       mnt_drop_write(nd->path.mnt);
-                       goto exit;
-               }
+               want_write = 1;
                /* Don't check for write permission, don't truncate */
-               error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
-               if (error) {
-                       mnt_drop_write(nd->path.mnt);
-                       goto exit;
-               }
-               filp = nameidata_to_filp(nd);
-               mnt_drop_write(nd->path.mnt);
-               path_put(&nd->path);
-               if (!IS_ERR(filp)) {
-                       error = ima_file_check(filp, op->acc_mode);
-                       if (error) {
-                               fput(filp);
-                               filp = ERR_PTR(error);
-                       }
-               }
-               return filp;
+               open_flag &= ~O_TRUNC;
+               will_truncate = 0;
+               acc_mode = MAY_OPEN;
+               error = security_path_mknod(&nd->path, dentry, mode, 0);
+               if (error)
+                       goto exit_mutex_unlock;
+               error = vfs_create(dir->d_inode, dentry, mode, nd);
+               if (error)
+                       goto exit_mutex_unlock;
+               mutex_unlock(&dir->d_inode->i_mutex);
+               dput(nd->path.dentry);
+               nd->path.dentry = dentry;
+               goto common;
        }
 
        /*
@@ -2261,7 +2168,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        audit_inode(pathname, path->dentry);
 
        error = -EEXIST;
-       if (op->open_flag & O_EXCL)
+       if (open_flag & O_EXCL)
                goto exit_dput;
 
        error = follow_managed(path, nd->flags);
@@ -2281,18 +2188,19 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (S_ISDIR(nd->inode->i_mode))
                goto exit;
 ok:
-       will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+       if (!S_ISREG(nd->inode->i_mode))
+               will_truncate = 0;
+
        if (will_truncate) {
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto exit;
+               want_write = 1;
        }
-       error = may_open(&nd->path, op->acc_mode, op->open_flag);
-       if (error) {
-               if (will_truncate)
-                       mnt_drop_write(nd->path.mnt);
+common:
+       error = may_open(&nd->path, acc_mode, open_flag);
+       if (error)
                goto exit;
-       }
        filp = nameidata_to_filp(nd);
        if (!IS_ERR(filp)) {
                error = ima_file_check(filp, op->acc_mode);
@@ -2310,12 +2218,8 @@ ok:
                        }
                }
        }
-       /*
-        * It is now safe to drop the mnt write
-        * because the filp has had a write taken
-        * on its behalf.
-        */
-       if (will_truncate)
+out:
+       if (want_write)
                mnt_drop_write(nd->path.mnt);
        path_put(&nd->path);
        return filp;
@@ -2325,18 +2229,16 @@ exit_mutex_unlock:
 exit_dput:
        path_put_conditional(path, nd);
 exit:
-       path_put(&nd->path);
-       return ERR_PTR(error);
+       filp = ERR_PTR(error);
+       goto out;
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-               const struct open_flags *op, int flags)
+               struct nameidata *nd, const struct open_flags *op, int flags)
 {
        struct file *base = NULL;
        struct file *filp;
-       struct nameidata nd;
        struct path path;
-       int count = 0;
        int error;
 
        filp = get_empty_filp();
@@ -2344,62 +2246,46 @@ static struct file *path_openat(int dfd, const char *pathname,
                return ERR_PTR(-ENFILE);
 
        filp->f_flags = op->open_flag;
-       nd.intent.open.file = filp;
-       nd.intent.open.flags = open_to_namei_flags(op->open_flag);
-       nd.intent.open.create_mode = op->mode;
+       nd->intent.open.file = filp;
+       nd->intent.open.flags = open_to_namei_flags(op->open_flag);
+       nd->intent.open.create_mode = op->mode;
 
-       error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
+       error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
        if (unlikely(error))
                goto out_filp;
 
        current->total_link_count = 0;
-       error = link_path_walk(pathname, &nd);
+       error = link_path_walk(pathname, nd);
        if (unlikely(error))
                goto out_filp;
 
-       filp = do_last(&nd, &path, op, pathname);
+       filp = do_last(nd, &path, op, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
                struct path link = path;
-               struct inode *linki = link.dentry->d_inode;
                void *cookie;
-               error = -ELOOP;
-               if (!(nd.flags & LOOKUP_FOLLOW))
-                       goto exit_dput;
-               if (count++ == 32)
-                       goto exit_dput;
-               /*
-                * This is subtle. Instead of calling do_follow_link() we do
-                * the thing by hands. The reason is that this way we have zero
-                * link_count and path_walk() (called from ->follow_link)
-                * honoring LOOKUP_PARENT.  After that we have the parent and
-                * last component, i.e. we are in the same situation as after
-                * the first path_walk().  Well, almost - if the last component
-                * is normal we get its copy stored in nd->last.name and we will
-                * have to putname() it when we are done. Procfs-like symlinks
-                * just set LAST_BIND.
-                */
-               nd.flags |= LOOKUP_PARENT;
-               nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-               error = __do_follow_link(&link, &nd, &cookie);
+               if (!(nd->flags & LOOKUP_FOLLOW)) {
+                       path_put_conditional(&path, nd);
+                       path_put(&nd->path);
+                       filp = ERR_PTR(-ELOOP);
+                       break;
+               }
+               nd->flags |= LOOKUP_PARENT;
+               nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+               error = follow_link(&link, nd, &cookie);
                if (unlikely(error))
                        filp = ERR_PTR(error);
                else
-                       filp = do_last(&nd, &path, op, pathname);
-               if (!IS_ERR(cookie) && linki->i_op->put_link)
-                       linki->i_op->put_link(link.dentry, &nd, cookie);
-               path_put(&link);
+                       filp = do_last(nd, &path, op, pathname);
+               put_link(nd, &link, cookie);
        }
 out:
-       if (nd.root.mnt)
-               path_put(&nd.root);
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
+               path_put(&nd->root);
        if (base)
                fput(base);
-       release_open_intent(&nd);
+       release_open_intent(nd);
        return filp;
 
-exit_dput:
-       path_put_conditional(&path, &nd);
-       path_put(&nd.path);
 out_filp:
        filp = ERR_PTR(error);
        goto out;
@@ -2408,16 +2294,39 @@ out_filp:
 struct file *do_filp_open(int dfd, const char *pathname,
                const struct open_flags *op, int flags)
 {
+       struct nameidata nd;
        struct file *filp;
 
-       filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+       filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
        if (unlikely(filp == ERR_PTR(-ECHILD)))
-               filp = path_openat(dfd, pathname, op, flags);
+               filp = path_openat(dfd, pathname, &nd, op, flags);
        if (unlikely(filp == ERR_PTR(-ESTALE)))
-               filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+               filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
        return filp;
 }
 
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+               const char *name, const struct open_flags *op, int flags)
+{
+       struct nameidata nd;
+       struct file *file;
+
+       nd.root.mnt = mnt;
+       nd.root.dentry = dentry;
+
+       flags |= LOOKUP_ROOT;
+
+       if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+               return ERR_PTR(-ELOOP);
+
+       file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+       if (unlikely(file == ERR_PTR(-ECHILD)))
+               file = path_openat(-1, name, &nd, op, flags);
+       if (unlikely(file == ERR_PTR(-ESTALE)))
+               file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+       return file;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
@@ -2478,7 +2387,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        if (error)
                return error;
 
-       if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+       if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+           !ns_capable(inode_userns(dir), CAP_MKNOD))
                return -EPERM;
 
        if (!dir->i_op->mknod)
@@ -2639,10 +2549,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 }
 
 /*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
  *
  * A low-level filesystem can, if it choses, legally
  * do a
@@ -2655,10 +2565,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
  */
 void dentry_unhash(struct dentry *dentry)
 {
-       dget(dentry);
        shrink_dcache_parent(dentry);
        spin_lock(&dentry->d_lock);
-       if (dentry->d_count == 2)
+       if (dentry->d_count == 1)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
 }
@@ -2674,25 +2583,27 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -EPERM;
 
        mutex_lock(&dentry->d_inode->i_mutex);
-       dentry_unhash(dentry);
+
+       error = -EBUSY;
        if (d_mountpoint(dentry))
-               error = -EBUSY;
-       else {
-               error = security_inode_rmdir(dir, dentry);
-               if (!error) {
-                       error = dir->i_op->rmdir(dir, dentry);
-                       if (!error) {
-                               dentry->d_inode->i_flags |= S_DEAD;
-                               dont_mount(dentry);
-                       }
-               }
-       }
+               goto out;
+
+       error = security_inode_rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       shrink_dcache_parent(dentry);
+       error = dir->i_op->rmdir(dir, dentry);
+       if (error)
+               goto out;
+
+       dentry->d_inode->i_flags |= S_DEAD;
+       dont_mount(dentry);
+
+out:
        mutex_unlock(&dentry->d_inode->i_mutex);
-       if (!error) {
+       if (!error)
                d_delete(dentry);
-       }
-       dput(dentry);
-
        return error;
 }
 
@@ -2726,6 +2637,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit2;
+       if (!dentry->d_inode) {
+               error = -ENOENT;
+               goto exit3;
+       }
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto exit3;
@@ -2814,8 +2729,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                if (nd.last.name[nd.last.len])
                        goto slashes;
                inode = dentry->d_inode;
-               if (inode)
-                       ihold(inode);
+               if (!inode)
+                       goto slashes;
+               ihold(inode);
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
@@ -2955,7 +2871,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
                return error;
 
        mutex_lock(&inode->i_mutex);
-       error = dir->i_op->link(old_dentry, dir, new_dentry);
+       /* Make sure we don't allow creating hardlink to an unlinked file */
+       if (inode->i_nlink == 0)
+               error =  -ENOENT;
+       else
+               error = dir->i_op->link(old_dentry, dir, new_dentry);
        mutex_unlock(&inode->i_mutex);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
@@ -2977,15 +2897,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        struct dentry *new_dentry;
        struct nameidata nd;
        struct path old_path;
+       int how = 0;
        int error;
        char *to;
 
-       if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+       if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
+       /*
+        * To use null names we require CAP_DAC_READ_SEARCH
+        * This ensures that not everyone will be able to create
+        * handlink using the passed filedescriptor.
+        */
+       if (flags & AT_EMPTY_PATH) {
+               if (!capable(CAP_DAC_READ_SEARCH))
+                       return -ENOENT;
+               how = LOOKUP_EMPTY;
+       }
+
+       if (flags & AT_SYMLINK_FOLLOW)
+               how |= LOOKUP_FOLLOW;
 
-       error = user_path_at(olddfd, oldname,
-                            flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-                            &old_path);
+       error = user_path_at(olddfd, oldname, how, &old_path);
        if (error)
                return error;
 
@@ -3047,12 +2979,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *        HOWEVER, it relies on the assumption that any object with ->lookup()
  *        has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *        we'd better make sure that there's no link(2) for them.
- *     d) some filesystems don't support opened-but-unlinked directories,
- *        either because of layout or because they are not ready to deal with
- *        all cases correctly. The latter will be fixed (taking this sort of
- *        stuff into VFS), but the former is not going away. Solution: the same
- *        trick as in rmdir().
- *     e) conversion from fhandle to dentry may come in the wrong moment - when
+ *     d) conversion from fhandle to dentry may come in the wrong moment - when
  *        we are removing the target. Solution: we will have to grab ->i_mutex
  *        in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
  *        ->i_mutex on parents, which works but leads to some truly excessive
@@ -3062,7 +2989,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry)
 {
        int error = 0;
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
 
        /*
         * If we are going to change the parent - check write permissions,
@@ -3078,26 +3005,26 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
 
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
-       if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else {
-               if (target)
-                       dentry_unhash(new_dentry);
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       }
+
+       error = -EBUSY;
+       if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+               goto out;
+
+       if (target)
+               shrink_dcache_parent(new_dentry);
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
        if (target) {
-               if (!error) {
-                       target->i_flags |= S_DEAD;
-                       dont_mount(new_dentry);
-               }
-               mutex_unlock(&target->i_mutex);
-               if (d_unhashed(new_dentry))
-                       d_rehash(new_dentry);
-               dput(new_dentry);
+               target->i_flags |= S_DEAD;
+               dont_mount(new_dentry);
        }
+out:
+       if (target)
+               mutex_unlock(&target->i_mutex);
        if (!error)
                if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                        d_move(old_dentry,new_dentry);
@@ -3107,7 +3034,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                            struct inode *new_dir, struct dentry *new_dentry)
 {
-       struct inode *target;
+       struct inode *target = new_dentry->d_inode;
        int error;
 
        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3115,19 +3042,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                return error;
 
        dget(new_dentry);
-       target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
+
+       error = -EBUSY;
        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-       if (!error) {
-               if (target)
-                       dont_mount(new_dentry);
-               if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-                       d_move(old_dentry, new_dentry);
-       }
+               goto out;
+
+       error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (error)
+               goto out;
+
+       if (target)
+               dont_mount(new_dentry);
+       if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+               d_move(old_dentry, new_dentry);
+out:
        if (target)
                mutex_unlock(&target->i_mutex);
        dput(new_dentry);