#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
+#include <linux/posix_acl.h>
#include <asm/uaccess.h>
#include "internal.h"
* name indicated by the symlink. The old code always complained that the
* name already exists, due to not following the symlink even if its target
* is nonexistent. The new semantics affects also mknod() and link() when
- * the name is a symlink pointing to a non-existant name.
+ * the name is a symlink pointing to a non-existent name.
*
* I don't know which semantics is the right one, since I have no access
* to standards. But I found by trial that HP-UX 9.0 has the full "new"
return retval;
}
-static char *getname_flags(const char __user * filename, int flags)
+static char *getname_flags(const char __user *filename, int flags, int *empty)
{
char *tmp, *result;
result = tmp;
if (retval < 0) {
+ if (retval == -ENOENT && empty)
+ *empty = 1;
if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
__putname(tmp);
result = ERR_PTR(retval);
char *getname(const char __user * filename)
{
- return getname_flags(filename, 0);
+ return getname_flags(filename, 0, 0);
}
#ifdef CONFIG_AUDITSYSCALL
EXPORT_SYMBOL(putname);
#endif
+static int check_acl(struct inode *inode, int mask)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+ struct posix_acl *acl;
+
+ if (mask & MAY_NOT_BLOCK) {
+ acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
+ if (!acl)
+ return -EAGAIN;
+ /* no ->get_acl() calls in RCU mode... */
+ if (acl == ACL_NOT_CACHED)
+ return -ECHILD;
+ return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
+ }
+
+ acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+
+ /*
+ * A filesystem can force a ACL callback by just never filling the
+ * ACL cache. But normally you'd fill the cache either at inode
+ * instantiation time, or on the first ->get_acl call.
+ *
+ * If the filesystem doesn't have a get_acl() function at all, we'll
+ * just create the negative cache entry.
+ */
+ if (acl == ACL_NOT_CACHED) {
+ if (inode->i_op->get_acl) {
+ acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ } else {
+ set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
+ return -EAGAIN;
+ }
+ }
+
+ if (acl) {
+ int error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
+ }
+#endif
+
+ return -EAGAIN;
+}
+
/*
* This does basic POSIX ACL permission checking
*/
-static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
- int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
+static int acl_permission_check(struct inode *inode, int mask)
{
- umode_t mode = inode->i_mode;
+ unsigned int mode = inode->i_mode;
- mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
+ mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
+
+ if (current_user_ns() != inode_userns(inode))
+ goto other_perms;
- if (current_fsuid() == inode->i_uid)
+ if (likely(current_fsuid() == inode->i_uid))
mode >>= 6;
else {
- if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
- int error = check_acl(inode, mask, flags);
+ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
+ int error = check_acl(inode, mask);
if (error != -EAGAIN)
return error;
}
mode >>= 3;
}
+other_perms:
/*
* If the DACs are ok we don't need any capability check.
*/
- if ((mask & ~mode) == 0)
+ if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
return 0;
return -EACCES;
}
* generic_permission - check for access rights on a Posix-like filesystem
* @inode: inode to check access rights for
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl: optional callback to check for Posix ACLs
- * @flags: IPERM_FLAG_ flags.
*
* Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
* request cannot be satisfied (eg. requires blocking or too much complexity).
* It would then be called again in ref-walk mode.
*/
-int generic_permission(struct inode *inode, int mask, unsigned int flags,
- int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
+int generic_permission(struct inode *inode, int mask)
{
int ret;
/*
* Do the basic POSIX ACL permission checks.
*/
- ret = acl_permission_check(inode, mask, flags, check_acl);
+ ret = acl_permission_check(inode, mask);
if (ret != -EACCES)
return ret;
+ if (S_ISDIR(inode->i_mode)) {
+ /* DACs are overridable for directories */
+ if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
+ return 0;
+ if (!(mask & MAY_WRITE))
+ if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
+ return 0;
+ return -EACCES;
+ }
/*
* Read/write DACs are always overridable.
- * Executable DACs are overridable if at least one exec bit is set.
+ * Executable DACs are overridable when there is
+ * at least one exec bit set.
*/
- if (!(mask & MAY_EXEC) || execute_ok(inode))
- if (capable(CAP_DAC_OVERRIDE))
+ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+ if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
return 0;
/*
* Searching includes executable on directories, else just read.
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
- if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
- if (capable(CAP_DAC_READ_SEARCH))
+ if (mask == MAY_READ)
+ if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
+/*
+ * We _really_ want to just do "generic_permission()" without
+ * even looking at the inode->i_op values. So we keep a cache
+ * flag in inode->i_opflags, that says "this has not special
+ * permission function, use the fast case".
+ */
+static inline int do_inode_permission(struct inode *inode, int mask)
+{
+ if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+ if (likely(inode->i_op->permission))
+ return inode->i_op->permission(inode, mask);
+
+ /* This gets set once for the inode lifetime */
+ spin_lock(&inode->i_lock);
+ inode->i_opflags |= IOP_FASTPERM;
+ spin_unlock(&inode->i_lock);
+ }
+ return generic_permission(inode, mask);
+}
+
/**
* inode_permission - check for access rights to a given inode
* @inode: inode to check permission on
{
int retval;
- if (mask & MAY_WRITE) {
+ if (unlikely(mask & MAY_WRITE)) {
umode_t mode = inode->i_mode;
/*
return -EACCES;
}
- if (inode->i_op->permission)
- retval = inode->i_op->permission(inode, mask, 0);
- else
- retval = generic_permission(inode, mask, 0,
- inode->i_op->check_acl);
-
+ retval = do_inode_permission(inode, mask);
if (retval)
return retval;
return security_inode_permission(inode, mask);
}
-/**
- * file_permission - check for additional access rights to a given file
- * @file: file to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on an already opened
- * file.
- *
- * Note:
- * Do not use this function in new code. All access checks should
- * be done using inode_permission().
- */
-int file_permission(struct file *file, int mask)
-{
- return inode_permission(file->f_path.dentry->d_inode, mask);
-}
-
-/*
- * get_write_access() gets write permission for a file.
- * put_write_access() releases this write permission.
- * This is used for regular files.
- * We cannot support write (and maybe mmap read-write shared) accesses and
- * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
- * can have the following values:
- * 0: no writers, no VM_DENYWRITE mappings
- * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
- * > 0: (i_writecount) users are writing to the file.
- *
- * Normally we operate on that counter with atomic_{inc,dec} and it's safe
- * except for the cases where we don't hold i_writecount yet. Then we need to
- * use {get,deny}_write_access() - these functions check the sign and refuse
- * to do the change if sign is wrong. Exclusion between them is provided by
- * the inode->i_lock spinlock.
- */
-
-int get_write_access(struct inode * inode)
-{
- spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_writecount) < 0) {
- spin_unlock(&inode->i_lock);
- return -ETXTBSY;
- }
- atomic_inc(&inode->i_writecount);
- spin_unlock(&inode->i_lock);
-
- return 0;
-}
-
-int deny_write_access(struct file * file)
-{
- struct inode *inode = file->f_path.dentry->d_inode;
-
- spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_writecount) > 0) {
- spin_unlock(&inode->i_lock);
- return -ETXTBSY;
- }
- atomic_dec(&inode->i_writecount);
- spin_unlock(&inode->i_lock);
-
- return 0;
-}
-
/**
* path_get - get a reference to a path
* @path: path to get the reference to
}
EXPORT_SYMBOL(path_put);
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt). In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode. Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
*/
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
- struct fs_struct *fs = current->fs;
- struct dentry *dentry = nd->path.dentry;
- int want_root = 0;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- want_root = 1;
- spin_lock(&fs->lock);
- if (nd->root.mnt != fs->root.mnt ||
- nd->root.dentry != fs->root.dentry)
- goto err_root;
- }
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
- if (want_root) {
- path_get(&nd->root);
- spin_unlock(&fs->lock);
- }
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- nd->flags &= ~LOOKUP_RCU;
- return 0;
-err:
- spin_unlock(&dentry->d_lock);
-err_root:
- if (want_root)
- spin_unlock(&fs->lock);
- return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
- if (nd->flags & LOOKUP_RCU)
- return nameidata_drop_rcu(nd);
- return 0;
-}
/**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
* Returns: 0 on success, -ECHILD on failure
*
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode. @dentry must be a path found by a do_lookup call on
+ * @nd or NULL. Must be called from rcu-walk context.
*/
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
goto err_root;
}
spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- /*
- * If the sequence check on the child dentry passed, then the child has
- * not been removed from its parent. This means the parent dentry must
- * be valid and able to take a reference at this point.
- */
- BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
- BUG_ON(!parent->d_count);
- parent->d_count++;
- spin_unlock(&dentry->d_lock);
+ if (!dentry) {
+ if (!__d_rcu_to_refcount(parent, nd->seq))
+ goto err_parent;
+ BUG_ON(nd->inode != parent->d_inode);
+ } else {
+ if (dentry->d_parent != parent)
+ goto err_parent;
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (!__d_rcu_to_refcount(dentry, nd->seq))
+ goto err_child;
+ /*
+ * If the sequence check on the child dentry passed, then
+ * the child has not been removed from its parent. This
+ * means the parent dentry must be valid and able to take
+ * a reference at this point.
+ */
+ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+ BUG_ON(!parent->d_count);
+ parent->d_count++;
+ spin_unlock(&dentry->d_lock);
+ }
spin_unlock(&parent->d_lock);
if (want_root) {
path_get(&nd->root);
br_read_unlock(vfsmount_lock);
nd->flags &= ~LOOKUP_RCU;
return 0;
-err:
+
+err_child:
spin_unlock(&dentry->d_lock);
+err_parent:
spin_unlock(&parent->d_lock);
err_root:
if (want_root)
return -ECHILD;
}
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
- if (nd->flags & LOOKUP_RCU) {
- if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
- }
- }
- return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
- struct dentry *dentry = nd->path.dentry;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err_unlock;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
-
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
-
- return 0;
-
-err_unlock:
- spin_unlock(&dentry->d_lock);
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
-}
-
/**
* release_open_intent - free up open intent resources
* @nd: pointer to nameidata
return dentry->d_op->d_revalidate(dentry, nd);
}
-static struct dentry *
-do_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- int status = d_revalidate(dentry, nd);
- if (unlikely(status <= 0)) {
- /*
- * The dentry failed validation.
- * If d_revalidate returned 0 attempt to invalidate
- * the dentry otherwise d_revalidate is asking us
- * to return a fail status.
- */
- if (status < 0) {
- dput(dentry);
- dentry = ERR_PTR(status);
- } else if (!d_invalidate(dentry)) {
- dput(dentry);
- dentry = NULL;
- }
- }
- return dentry;
-}
-
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd: pointer nameidata
*
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it. Return 0 on
+ * success, -error on failure. In case of failure caller does not
+ * need to drop nd->path.
*/
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
+ if (nd->flags & LOOKUP_RCU) {
+ nd->flags &= ~LOOKUP_RCU;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ spin_lock(&dentry->d_lock);
+ if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ return -ECHILD;
+ }
+ BUG_ON(nd->inode != dentry->d_inode);
+ spin_unlock(&dentry->d_lock);
+ mntget(nd->path.mnt);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ }
+
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
if (!status)
status = -ESTALE;
+ path_put(&nd->path);
return status;
}
-/*
- * Short-cut version of permission(), for calling on directories
- * during pathname resolution. Combines parts of permission()
- * and generic_permission(), and tests ONLY for MAY_EXEC permission.
- *
- * If appropriate, check DAC only. If not appropriate, or
- * short-cut DAC fails, then call ->permission() to do more
- * complete permission check.
- */
-static inline int exec_permission(struct inode *inode, unsigned int flags)
-{
- int ret;
-
- if (inode->i_op->permission) {
- ret = inode->i_op->permission(inode, MAY_EXEC, flags);
- } else {
- ret = acl_permission_check(inode, MAY_EXEC, flags,
- inode->i_op->check_acl);
- }
- if (likely(!ret))
- goto ok;
- if (ret == -ECHILD)
- return ret;
-
- if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
- goto ok;
-
- return ret;
-ok:
- return security_inode_exec_permission(inode, flags);
-}
-
static __always_inline void set_root(struct nameidata *nd)
{
if (!nd->root.mnt)
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root;
+ nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
}
}
nd->path.dentry = path->dentry;
}
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+ struct inode *inode = link->dentry->d_inode;
+ if (!IS_ERR(cookie) && inode->i_op->put_link)
+ inode->i_op->put_link(link->dentry, nd, cookie);
+ path_put(link);
+}
+
static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
{
int error;
struct dentry *dentry = link->dentry;
BUG_ON(nd->flags & LOOKUP_RCU);
- touch_atime(link->mnt, dentry);
- nd_set_link(nd, NULL);
-
if (link->mnt == nd->path.mnt)
mntget(link->mnt);
+ if (unlikely(current->total_link_count >= 40)) {
+ *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+ path_put(&nd->path);
+ return -ELOOP;
+ }
+ cond_resched();
+ current->total_link_count++;
+
+ touch_atime(link->mnt, dentry);
+ nd_set_link(nd, NULL);
+
error = security_inode_follow_link(link->dentry, nd);
if (error) {
*p = ERR_PTR(error); /* no ->put_link(), please */
error = 0;
if (s)
error = __vfs_follow_link(nd, s);
- else if (nd->last_type == LAST_BIND)
+ else if (nd->last_type == LAST_BIND) {
nd->flags |= LOOKUP_JUMPED;
+ nd->inode = nd->path.dentry->d_inode;
+ if (nd->inode->i_op->follow_link) {
+ /* stepped on a _really_ weird one */
+ path_put(&nd->path);
+ error = -ELOOP;
+ }
+ }
}
return error;
}
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
-{
- void *cookie;
- int err = -ELOOP;
-
- /* We drop rcu-walk here */
- if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
- return -ECHILD;
- BUG_ON(inode != path->dentry->d_inode);
-
- if (current->link_count >= MAX_NESTED_LINKS)
- goto loop;
- if (current->total_link_count >= 40)
- goto loop;
- BUG_ON(nd->depth >= MAX_NESTED_LINKS);
- cond_resched();
- current->link_count++;
- current->total_link_count++;
- nd->depth++;
- err = __do_follow_link(path, nd, &cookie);
- if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
- path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
- path_put(path);
- current->link_count--;
- nd->depth--;
- return err;
-loop:
- path_put_conditional(path, nd);
- path_put(&nd->path);
- return err;
-}
-
static int follow_up_rcu(struct path *path)
{
struct vfsmount *parent;
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
return -EREMOTE;
- /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
- * and this is the terminal part of the path.
- */
- if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
- return -EISDIR; /* we actually want to stop here */
-
- /* We want to mount if someone is trying to open/create a file of any
- * type under the mountpoint, wants to traverse through the mountpoint
- * or wants to open the mounted directory.
+ /* We don't want to mount if someone's just doing a stat -
+ * unless they're stat'ing a directory and appended a '/' to
+ * the name.
*
- * We don't want to mount if someone's just doing a stat and they've
- * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
- * appended a '/' to the name.
+ * We do, however, want to mount if someone wants to open or
+ * create a file of any type under the mountpoint, wants to
+ * traverse through the mountpoint or wants to open the
+ * mounted directory. Also, autofs may mark negative dentries
+ * as being automount points. These will need the attentions
+ * of the daemon to instantiate them before they can be used.
*/
- if (!(flags & LOOKUP_FOLLOW) &&
- !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE)))
+ if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+ LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+ path->dentry->d_inode)
return -EISDIR;
current->total_link_count++;
* the path being looked up; if it wasn't then the remainder of
* the path is inaccessible and we should say so.
*/
- if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
+ if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
return -EREMOTE;
return PTR_ERR(mnt);
}
if (!mnt) /* mount collision */
return 0;
+ if (!*need_mntput) {
+ /* lock_mount() may release path->mnt on error */
+ mntget(path->mnt);
+ *need_mntput = true;
+ }
err = finish_automount(mnt, path);
switch (err) {
/* Someone else made a mount here whilst we were busy */
return 0;
case 0:
- dput(path->dentry);
- if (*need_mntput)
- mntput(path->mnt);
+ path_put(path);
path->mnt = mnt;
path->dentry = dget(mnt->mnt_root);
- *need_mntput = true;
return 0;
default:
return err;
*/
static int follow_managed(struct path *path, unsigned flags)
{
+ struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
unsigned managed;
bool need_mntput = false;
- int ret;
+ int ret = 0;
/* Given that we're not holding a lock here, we retain the value in a
* local variable for each dentry as we look at it so that we don't see
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
- ret = path->dentry->d_op->d_manage(path->dentry,
- false, false);
+ ret = path->dentry->d_op->d_manage(path->dentry, false);
if (ret < 0)
- return ret == -EISDIR ? 0 : ret;
+ break;
}
/* Transit to a mounted filesystem. */
if (managed & DCACHE_NEED_AUTOMOUNT) {
ret = follow_automount(path, flags, &need_mntput);
if (ret < 0)
- return ret == -EISDIR ? 0 : ret;
+ break;
continue;
}
/* We didn't change the current path point */
break;
}
- return 0;
+
+ if (need_mntput && path->mnt == mnt)
+ mntput(path->mnt);
+ if (ret == -EISDIR)
+ ret = 0;
+ return ret < 0 ? ret : need_mntput;
}
int follow_down_one(struct path *path)
return 0;
}
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+ return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+ dentry->d_op->d_manage(dentry, true) < 0);
+}
+
/*
- * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
- * meet a managed dentry and we're not walking to "..". True is returned to
- * continue, false to abort.
+ * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
+ * we meet a managed dentry that would need blocking.
*/
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode, bool reverse_transit)
+ struct inode **inode)
{
- while (d_mountpoint(path->dentry)) {
+ for (;;) {
struct vfsmount *mounted;
- if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
- !reverse_transit &&
- path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+ /*
+ * Don't forget we might have a non-mountpoint managed dentry
+ * that wants to block transit.
+ */
+ if (unlikely(managed_dentry_might_block(path->dentry)))
return false;
+
+ if (!d_mountpoint(path->dentry))
+ break;
+
mounted = __lookup_mnt(path->mnt, path->dentry, 1);
if (!mounted)
break;
path->mnt = mounted;
path->dentry = mounted->mnt_root;
+ nd->flags |= LOOKUP_JUMPED;
nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+ /*
+ * Update the inode too. We don't need to re-check the
+ * dentry sequence number here after this d_inode read,
+ * because a mount-point is always pinned.
+ */
*inode = path->dentry->d_inode;
}
-
- if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
- return reverse_transit;
return true;
}
-static int follow_dotdot_rcu(struct nameidata *nd)
+static void follow_mount_rcu(struct nameidata *nd)
{
- struct inode *inode = nd->inode;
+ while (d_mountpoint(nd->path.dentry)) {
+ struct vfsmount *mounted;
+ mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+ if (!mounted)
+ break;
+ nd->path.mnt = mounted;
+ nd->path.dentry = mounted->mnt_root;
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ }
+}
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
set_root_rcu(nd);
while (1) {
seq = read_seqcount_begin(&parent->d_seq);
if (read_seqcount_retry(&old->d_seq, nd->seq))
goto failed;
- inode = parent->d_inode;
nd->path.dentry = parent;
nd->seq = seq;
break;
if (!follow_up_rcu(&nd->path))
break;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- inode = nd->path.dentry->d_inode;
}
- __follow_mount_rcu(nd, &nd->path, &inode, true);
- nd->inode = inode;
+ follow_mount_rcu(nd);
+ nd->inode = nd->path.dentry->d_inode;
return 0;
failed:
* Follow down to the covering mount currently visible to userspace. At each
* point, the filesystem owning that dentry may be queried as to whether the
* caller is permitted to proceed or not.
- *
- * Care must be taken as namespace_sem may be held (indicated by mounting_here
- * being true).
*/
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
{
unsigned managed;
int ret;
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(
- path->dentry, mounting_here, false);
+ path->dentry, false);
if (ret < 0)
return ret == -EISDIR ? 0 : ret;
}
return dentry;
}
+/*
+ * We already have a dentry, but require a lookup to be performed on the parent
+ * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
+ * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
+ * child exists while under i_mutex.
+ */
+static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode = parent->d_inode;
+ struct dentry *old;
+
+ /* Don't create child dentry for a dead directory. */
+ if (unlikely(IS_DEADDIR(inode)))
+ return ERR_PTR(-ENOENT);
+
+ old = inode->i_op->lookup(inode, dentry, nd);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
+ }
+ return dentry;
+}
+
/*
* It's more convoluted than I'd like it to be, but... it's still fairly
* small and for now I'd prefer to have fast path as straight as possible.
goto unlazy;
}
}
+ if (unlikely(d_need_lookup(dentry)))
+ goto unlazy;
path->mnt = mnt;
path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, false)))
- return 0;
+ if (unlikely(!__follow_mount_rcu(nd, path, inode)))
+ goto unlazy;
+ if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ goto unlazy;
+ return 0;
unlazy:
- if (dentry) {
- if (nameidata_dentry_drop_rcu(nd, dentry))
- return -ECHILD;
- } else {
- if (nameidata_drop_rcu(nd))
- return -ECHILD;
- }
+ if (unlazy_walk(nd, dentry))
+ return -ECHILD;
} else {
dentry = __d_lookup(parent, name);
}
+ if (dentry && unlikely(d_need_lookup(dentry))) {
+ dput(dentry);
+ dentry = NULL;
+ }
retry:
if (unlikely(!dentry)) {
struct inode *dir = parent->d_inode;
/* known good */
need_reval = 0;
status = 1;
+ } else if (unlikely(d_need_lookup(dentry))) {
+ dentry = d_inode_lookup(parent, dentry, nd);
+ if (IS_ERR(dentry)) {
+ mutex_unlock(&dir->i_mutex);
+ return PTR_ERR(dentry);
+ }
+ /* known good */
+ need_reval = 0;
+ status = 1;
}
mutex_unlock(&dir->i_mutex);
}
path_put_conditional(path, nd);
return err;
}
+ if (err)
+ nd->flags |= LOOKUP_JUMPED;
*inode = path->dentry->d_inode;
return 0;
}
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+ int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
- if (nameidata_drop_rcu(nd))
+ if (unlazy_walk(nd, NULL))
return -ECHILD;
}
- return exec_permission(nd->inode, 0);
+ return inode_permission(nd->inode, MAY_EXEC);
}
static inline int handle_dots(struct nameidata *nd, int type)
}
}
+/*
+ * Do we need to follow links? We _really_ want to be able
+ * to do this check without having to look at inode->i_op,
+ * so we keep a cache of "no, this doesn't need follow_link"
+ * for the common case.
+ */
+static inline int should_follow_link(struct inode *inode, int follow)
+{
+ if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+ if (likely(inode->i_op->follow_link))
+ return follow;
+
+ /* This gets set once for the inode lifetime */
+ spin_lock(&inode->i_lock);
+ inode->i_opflags |= IOP_NOFOLLOW;
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+static inline int walk_component(struct nameidata *nd, struct path *path,
+ struct qstr *name, int type, int follow)
+{
+ struct inode *inode;
+ int err;
+ /*
+ * "." and ".." are special - ".." especially so because it has
+ * to be able to know about the current root directory and
+ * parent relationships.
+ */
+ if (unlikely(type != LAST_NORM))
+ return handle_dots(nd, type);
+ err = do_lookup(nd, name, path, &inode);
+ if (unlikely(err)) {
+ terminate_walk(nd);
+ return err;
+ }
+ if (!inode) {
+ path_to_nameidata(path, nd);
+ terminate_walk(nd);
+ return -ENOENT;
+ }
+ if (should_follow_link(inode, follow)) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlikely(unlazy_walk(nd, path->dentry))) {
+ terminate_walk(nd);
+ return -ECHILD;
+ }
+ }
+ BUG_ON(inode != path->dentry->d_inode);
+ return 1;
+ }
+ path_to_nameidata(path, nd);
+ nd->inode = inode;
+ return 0;
+}
+
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+ int res;
+
+ if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+ path_put_conditional(path, nd);
+ path_put(&nd->path);
+ return -ELOOP;
+ }
+ BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+
+ nd->depth++;
+ current->link_count++;
+
+ do {
+ struct path link = *path;
+ void *cookie;
+
+ res = follow_link(&link, nd, &cookie);
+ if (!res)
+ res = walk_component(nd, path, &nd->last,
+ nd->last_type, LOOKUP_FOLLOW);
+ put_link(nd, &link, cookie);
+ } while (res > 0);
+
+ current->link_count--;
+ nd->depth--;
+ return res;
+}
+
+/*
+ * We really don't want to look at inode->i_op->lookup
+ * when we don't have to. So we keep a cache bit in
+ * the inode ->i_opflags field that says "yes, we can
+ * do lookup on this inode".
+ */
+static inline int can_lookup(struct inode *inode)
+{
+ if (likely(inode->i_opflags & IOP_LOOKUP))
+ return 1;
+ if (likely(!inode->i_op->lookup))
+ return 0;
+
+ /* We do this once for the lifetime of the inode */
+ spin_lock(&inode->i_lock);
+ inode->i_opflags |= IOP_LOOKUP;
+ spin_unlock(&inode->i_lock);
+ return 1;
+}
+
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
{
struct path next;
int err;
- unsigned int lookup_flags = nd->flags;
while (*name=='/')
name++;
if (!*name)
return 0;
- if (nd->depth)
- lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
/* At this point we know we have a real path component. */
for(;;) {
- struct inode *inode;
unsigned long hash;
struct qstr this;
unsigned int c;
int type;
- nd->flags |= LOOKUP_CONTINUE;
-
err = may_lookup(nd);
if (err)
break;
goto last_component;
while (*++name == '/');
if (!*name)
- goto last_with_slashes;
-
- /*
- * "." and ".." are special - ".." especially so because it has
- * to be able to know about the current root directory and
- * parent relationships.
- */
- if (unlikely(type != LAST_NORM)) {
- if (handle_dots(nd, type))
- return -ECHILD;
- continue;
- }
+ goto last_component;
- /* This does the actual lookups.. */
- err = do_lookup(nd, &this, &next, &inode);
- if (err)
- break;
+ err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+ if (err < 0)
+ return err;
- if (inode && inode->i_op->follow_link) {
- err = do_follow_link(inode, &next, nd);
+ if (err) {
+ err = nested_symlink(&next, nd);
if (err)
return err;
- nd->inode = nd->path.dentry->d_inode;
- } else {
- path_to_nameidata(&next, nd);
- nd->inode = inode;
}
- err = -ENOENT;
- if (!nd->inode)
- break;
+ if (can_lookup(nd->inode))
+ continue;
err = -ENOTDIR;
- if (!nd->inode->i_op->lookup)
- break;
- continue;
+ break;
/* here ends the main loop */
-last_with_slashes:
- lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
- /* Clear LOOKUP_CONTINUE iff it was previously unset */
- nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
- if (lookup_flags & LOOKUP_PARENT)
- goto lookup_parent;
- if (unlikely(type != LAST_NORM))
- return handle_dots(nd, type);
- err = do_lookup(nd, &this, &next, &inode);
- if (err)
- break;
- if (inode && unlikely(inode->i_op->follow_link) &&
- (lookup_flags & LOOKUP_FOLLOW)) {
- err = do_follow_link(inode, &next, nd);
- if (err)
- return err;
- nd->inode = nd->path.dentry->d_inode;
- } else {
- path_to_nameidata(&next, nd);
- nd->inode = inode;
- }
- err = -ENOENT;
- if (!nd->inode)
- break;
- if (lookup_flags & LOOKUP_DIRECTORY) {
- err = -ENOTDIR;
- if (!nd->inode->i_op->lookup)
- break;
- }
- return 0;
-lookup_parent:
nd->last = this;
nd->last_type = type;
return 0;
} else {
struct dentry *dentry;
- file = fget_light(dfd, &fput_needed);
+ file = fget_raw_light(dfd, &fput_needed);
retval = -EBADF;
if (!file)
goto out_fail;
if (!S_ISDIR(dentry->d_inode->i_mode))
goto fput_fail;
- retval = file_permission(file, MAY_EXEC);
+ retval = inode_permission(dentry->d_inode, MAY_EXEC);
if (retval)
goto fput_fail;
}
return retval;
}
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+ if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+ nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+ nd->flags &= ~LOOKUP_PARENT;
+ return walk_component(nd, path, &nd->last, nd->last_type,
+ nd->flags & LOOKUP_FOLLOW);
+}
+
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
struct file *base = NULL;
- int retval;
+ struct path path;
+ int err;
/*
* Path walking is largely split up into 2 different synchronisation
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
- retval = path_init(dfd, name, flags, nd, &base);
+ err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
- if (unlikely(retval))
- return retval;
+ if (unlikely(err))
+ return err;
current->total_link_count = 0;
- retval = link_path_walk(name, nd);
-
- if (nd->flags & LOOKUP_RCU) {
- /* went all way through without dropping RCU */
- BUG_ON(retval);
- if (nameidata_drop_rcu_last(nd))
- retval = -ECHILD;
+ err = link_path_walk(name, nd);
+
+ if (!err && !(flags & LOOKUP_PARENT)) {
+ err = lookup_last(nd, &path);
+ while (err > 0) {
+ void *cookie;
+ struct path link = path;
+ nd->flags |= LOOKUP_PARENT;
+ err = follow_link(&link, nd, &cookie);
+ if (!err)
+ err = lookup_last(nd, &path);
+ put_link(nd, &link, cookie);
+ }
}
- if (!retval)
- retval = handle_reval_path(nd);
+ if (!err)
+ err = complete_walk(nd);
+
+ if (!err && nd->flags & LOOKUP_DIRECTORY) {
+ if (!nd->inode->i_op->lookup) {
+ path_put(&nd->path);
+ err = -ENOTDIR;
+ }
+ }
if (base)
fput(base);
path_put(&nd->root);
nd->root.mnt = NULL;
}
- return retval;
+ return err;
}
static int do_path_lookup(int dfd, const char *name,
* @mnt: pointer to vfs mount of the base directory
* @name: pointer to file name
* @flags: lookup flags
- * @nd: pointer to nameidata
+ * @path: pointer to struct path to fill
*/
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
const char *name, unsigned int flags,
- struct nameidata *nd)
+ struct path *path)
{
- nd->root.dentry = dentry;
- nd->root.mnt = mnt;
+ struct nameidata nd;
+ int err;
+ nd.root.dentry = dentry;
+ nd.root.mnt = mnt;
+ BUG_ON(flags & LOOKUP_PARENT);
/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
- return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
+ err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
+ if (!err)
+ *path = nd.path;
+ return err;
}
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *dentry;
int err;
- err = exec_permission(inode, 0);
+ err = inode_permission(inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
*/
dentry = d_lookup(base, name);
- if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
- dentry = do_revalidate(dentry, nd);
+ if (dentry && d_need_lookup(dentry)) {
+ /*
+ * __lookup_hash is called with the parent dir's i_mutex already
+ * held, so we are good to go here.
+ */
+ dentry = d_inode_lookup(base, dentry, nd);
+ if (IS_ERR(dentry))
+ return dentry;
+ }
+
+ if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ int status = d_revalidate(dentry, nd);
+ if (unlikely(status <= 0)) {
+ /*
+ * The dentry failed validation.
+ * If d_revalidate returned 0 attempt to invalidate
+ * the dentry otherwise d_revalidate is asking us
+ * to return a fail status.
+ */
+ if (status < 0) {
+ dput(dentry);
+ return ERR_PTR(status);
+ } else if (!d_invalidate(dentry)) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ }
+ }
if (!dentry)
dentry = d_alloc_and_lookup(base, name, nd);
return __lookup_hash(&this, base, NULL);
}
-int user_path_at(int dfd, const char __user *name, unsigned flags,
- struct path *path)
+int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
+ struct path *path, int *empty)
{
struct nameidata nd;
- char *tmp = getname_flags(name, flags);
+ char *tmp = getname_flags(name, flags, empty);
int err = PTR_ERR(tmp);
if (!IS_ERR(tmp)) {
return err;
}
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+ struct path *path)
+{
+ return user_path_at_empty(dfd, name, flags, path, 0);
+}
+
static int user_path_parent(int dfd, const char __user *path,
struct nameidata *nd, char **name)
{
if (!(dir->i_mode & S_ISVTX))
return 0;
+ if (current_user_ns() != inode_userns(inode))
+ goto other_userns;
if (inode->i_uid == fsuid)
return 0;
if (dir->i_uid == fsuid)
return 0;
- return !capable(CAP_FOWNER);
+
+other_userns:
+ return !ns_capable(inode_userns(inode), CAP_FOWNER);
}
/*
struct inode *inode = dentry->d_inode;
int error;
+ /* O_PATH? */
+ if (!acc_mode)
+ return 0;
+
if (!inode)
return -ENOENT;
}
/* O_NOATIME can only be set by the owner or superuser */
- if (flag & O_NOATIME && !is_owner_or_cap(inode))
+ if (flag & O_NOATIME && !inode_owner_or_capable(inode))
return -EPERM;
/*
return error;
}
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc)
- * This is more logical, and also allows the 00 "no perm needed"
- * to be used for symlinks (where the permissions are checked
- * later).
- *
-*/
static inline int open_to_namei_flags(int flag)
{
- if ((flag+1) & O_ACCMODE)
- flag++;
+ if ((flag & O_ACCMODE) == 3)
+ flag--;
return flag;
}
int open_flag = op->open_flag;
int will_truncate = open_flag & O_TRUNC;
int want_write = 0;
- int skip_perm = 0;
+ int acc_mode = op->acc_mode;
struct file *filp;
- struct inode *inode;
int error;
nd->flags &= ~LOOKUP_PARENT;
return ERR_PTR(error);
/* fallthrough */
case LAST_ROOT:
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, nd->path.dentry);
if (open_flag & O_CREAT) {
error = -EISDIR;
}
goto ok;
case LAST_BIND:
- /* can't be RCU mode here */
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, dir);
goto ok;
}
if (!(open_flag & O_CREAT)) {
+ int symlink_ok = 0;
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+ if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+ symlink_ok = 1;
/* we _can_ be in RCU mode here */
- error = do_lookup(nd, &nd->last, path, &inode);
- if (error) {
- terminate_walk(nd);
+ error = walk_component(nd, path, &nd->last, LAST_NORM,
+ !symlink_ok);
+ if (error < 0)
return ERR_PTR(error);
- }
- if (!inode) {
- path_to_nameidata(path, nd);
- terminate_walk(nd);
- return ERR_PTR(-ENOENT);
- }
- if (unlikely(inode->i_op->follow_link)) {
- /* We drop rcu-walk here */
- if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
- return ERR_PTR(-ECHILD);
+ if (error) /* symlink */
return NULL;
- }
- path_to_nameidata(path, nd);
- nd->inode = inode;
/* sayonara */
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(-ECHILD);
error = -ENOTDIR;
if (nd->flags & LOOKUP_DIRECTORY) {
- if (!inode->i_op->lookup)
+ if (!nd->inode->i_op->lookup)
goto exit;
}
audit_inode(pathname, nd->path.dentry);
}
/* create side of things */
-
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ /*
+ * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
+ * cleared when we got to the last component we are about to look up
+ */
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(error);
audit_inode(pathname, dir);
error = -EISDIR;
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = 0;
- skip_perm = 1;
+ acc_mode = MAY_OPEN;
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
goto exit_mutex_unlock;
if (error < 0)
goto exit_dput;
+ if (error)
+ nd->flags |= LOOKUP_JUMPED;
+
error = -ENOENT;
if (!path->dentry->d_inode)
goto exit_dput;
path_to_nameidata(path, nd);
nd->inode = path->dentry->d_inode;
+ /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
+ error = complete_walk(nd);
+ if (error)
+ goto exit;
error = -EISDIR;
if (S_ISDIR(nd->inode->i_mode))
goto exit;
want_write = 1;
}
common:
- error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+ error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto exit;
filp = nameidata_to_filp(nd);
struct file *base = NULL;
struct file *filp;
struct path path;
- int count = 0;
int error;
filp = get_empty_filp();
filp = do_last(nd, &path, op, pathname);
while (unlikely(!filp)) { /* trailing symlink */
struct path link = path;
- struct inode *linki = link.dentry->d_inode;
void *cookie;
- if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+ if (!(nd->flags & LOOKUP_FOLLOW)) {
path_put_conditional(&path, nd);
path_put(&nd->path);
filp = ERR_PTR(-ELOOP);
break;
}
- /*
- * This is subtle. Instead of calling do_follow_link() we do
- * the thing by hands. The reason is that this way we have zero
- * link_count and path_walk() (called from ->follow_link)
- * honoring LOOKUP_PARENT. After that we have the parent and
- * last component, i.e. we are in the same situation as after
- * the first path_walk(). Well, almost - if the last component
- * is normal we get its copy stored in nd->last.name and we will
- * have to putname() it when we are done. Procfs-like symlinks
- * just set LAST_BIND.
- */
nd->flags |= LOOKUP_PARENT;
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
- error = __do_follow_link(&link, nd, &cookie);
+ error = follow_link(&link, nd, &cookie);
if (unlikely(error))
filp = ERR_PTR(error);
else
filp = do_last(nd, &path, op, pathname);
- if (!IS_ERR(cookie) && linki->i_op->put_link)
- linki->i_op->put_link(link.dentry, nd, cookie);
- path_put(&link);
+ put_link(nd, &link, cookie);
}
out:
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
flags |= LOOKUP_ROOT;
- if (dentry->d_inode->i_op->follow_link)
+ if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
return file;
}
-/**
- * lookup_create - lookup a dentry, creating it if it doesn't exist
- * @nd: nameidata info
- * @is_dir: directory flag
- *
- * Simple function to lookup and return a dentry and create it
- * if it doesn't exist. Is SMP-safe.
- *
- * Returns with nd->path.dentry->d_inode->i_mutex locked.
- */
-struct dentry *lookup_create(struct nameidata *nd, int is_dir)
+struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
+ struct nameidata nd;
+ int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+ if (error)
+ return ERR_PTR(error);
- mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
/*
* Yucky last component or no last component at all?
* (foo/., foo/.., /////)
*/
- if (nd->last_type != LAST_NORM)
- goto fail;
- nd->flags &= ~LOOKUP_PARENT;
- nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
- nd->intent.open.flags = O_EXCL;
+ if (nd.last_type != LAST_NORM)
+ goto out;
+ nd.flags &= ~LOOKUP_PARENT;
+ nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ nd.intent.open.flags = O_EXCL;
/*
* Do the final lookup.
*/
- dentry = lookup_hash(nd);
+ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_hash(&nd);
if (IS_ERR(dentry))
goto fail;
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
+ if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
dput(dentry);
dentry = ERR_PTR(-ENOENT);
+ goto fail;
}
+ *path = nd.path;
return dentry;
eexist:
dput(dentry);
dentry = ERR_PTR(-EEXIST);
fail:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out:
+ path_put(&nd.path);
return dentry;
}
-EXPORT_SYMBOL_GPL(lookup_create);
+EXPORT_SYMBOL(kern_path_create);
+
+struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
+{
+ char *tmp = getname(pathname);
+ struct dentry *res;
+ if (IS_ERR(tmp))
+ return ERR_CAST(tmp);
+ res = kern_path_create(dfd, tmp, path, is_dir);
+ putname(tmp);
+ return res;
+}
+EXPORT_SYMBOL(user_path_create);
int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
if (error)
return error;
- if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+ !ns_capable(inode_userns(dir), CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
unsigned, dev)
{
- int error;
- char *tmp;
struct dentry *dentry;
- struct nameidata nd;
+ struct path path;
+ int error;
if (S_ISDIR(mode))
return -EPERM;
- error = user_path_parent(dfd, filename, &nd, &tmp);
- if (error)
- return error;
+ dentry = user_path_create(dfd, filename, &path, 0);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
- dentry = lookup_create(&nd, 0);
- if (IS_ERR(dentry)) {
- error = PTR_ERR(dentry);
- goto out_unlock;
- }
- if (!IS_POSIXACL(nd.path.dentry->d_inode))
+ if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = may_mknod(mode);
if (error)
goto out_dput;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_dput;
- error = security_path_mknod(&nd.path, dentry, mode, dev);
+ error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out_drop_write;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
+ error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
break;
case S_IFCHR: case S_IFBLK:
- error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
+ error = vfs_mknod(path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
- error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
+ error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
break;
}
out_drop_write:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
out_dput:
dput(dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
- putname(tmp);
+ mutex_unlock(&path.dentry->d_inode->i_mutex);
+ path_put(&path);
return error;
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
{
- int error = 0;
- char * tmp;
struct dentry *dentry;
- struct nameidata nd;
-
- error = user_path_parent(dfd, pathname, &nd, &tmp);
- if (error)
- goto out_err;
+ struct path path;
+ int error;
- dentry = lookup_create(&nd, 1);
- error = PTR_ERR(dentry);
+ dentry = user_path_create(dfd, pathname, &path, 1);
if (IS_ERR(dentry))
- goto out_unlock;
+ return PTR_ERR(dentry);
- if (!IS_POSIXACL(nd.path.dentry->d_inode))
+ if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_dput;
- error = security_path_mkdir(&nd.path, dentry, mode);
+ error = security_path_mkdir(&path, dentry, mode);
if (error)
goto out_drop_write;
- error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+ error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
out_drop_write:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
out_dput:
dput(dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
- putname(tmp);
-out_err:
+ mutex_unlock(&path.dentry->d_inode->i_mutex);
+ path_put(&path);
return error;
}
}
/*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
*
* A low-level filesystem can, if it choses, legally
* do a
*/
void dentry_unhash(struct dentry *dentry)
{
- dget(dentry);
shrink_dcache_parent(dentry);
spin_lock(&dentry->d_lock);
- if (dentry->d_count == 2)
+ if (dentry->d_count == 1)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
if (!dir->i_op->rmdir)
return -EPERM;
+ dget(dentry);
mutex_lock(&dentry->d_inode->i_mutex);
- dentry_unhash(dentry);
+
+ error = -EBUSY;
if (d_mountpoint(dentry))
- error = -EBUSY;
- else {
- error = security_inode_rmdir(dir, dentry);
- if (!error) {
- error = dir->i_op->rmdir(dir, dentry);
- if (!error) {
- dentry->d_inode->i_flags |= S_DEAD;
- dont_mount(dentry);
- }
- }
- }
+ goto out;
+
+ error = security_inode_rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ shrink_dcache_parent(dentry);
+ error = dir->i_op->rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ dentry->d_inode->i_flags |= S_DEAD;
+ dont_mount(dentry);
+
+out:
mutex_unlock(&dentry->d_inode->i_mutex);
- if (!error) {
- d_delete(dentry);
- }
dput(dentry);
-
+ if (!error)
+ d_delete(dentry);
return error;
}
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
+ if (!dentry->d_inode) {
+ error = -ENOENT;
+ goto exit3;
+ }
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit3;
if (nd.last.name[nd.last.len])
goto slashes;
inode = dentry->d_inode;
- if (inode)
- ihold(inode);
+ if (!inode)
+ goto slashes;
+ ihold(inode);
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit2;
{
int error;
char *from;
- char *to;
struct dentry *dentry;
- struct nameidata nd;
+ struct path path;
from = getname(oldname);
if (IS_ERR(from))
return PTR_ERR(from);
- error = user_path_parent(newdfd, newname, &nd, &to);
- if (error)
- goto out_putname;
-
- dentry = lookup_create(&nd, 0);
+ dentry = user_path_create(newdfd, newname, &path, 0);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
- goto out_unlock;
+ goto out_putname;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_dput;
- error = security_path_symlink(&nd.path, dentry, from);
+ error = security_path_symlink(&path, dentry, from);
if (error)
goto out_drop_write;
- error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+ error = vfs_symlink(path.dentry->d_inode, dentry, from);
out_drop_write:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
out_dput:
dput(dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
- putname(to);
+ mutex_unlock(&path.dentry->d_inode->i_mutex);
+ path_put(&path);
out_putname:
putname(from);
return error;
int, newdfd, const char __user *, newname, int, flags)
{
struct dentry *new_dentry;
- struct nameidata nd;
- struct path old_path;
+ struct path old_path, new_path;
+ int how = 0;
int error;
- char *to;
- if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+ if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
return -EINVAL;
+ /*
+ * To use null names we require CAP_DAC_READ_SEARCH
+ * This ensures that not everyone will be able to create
+ * handlink using the passed filedescriptor.
+ */
+ if (flags & AT_EMPTY_PATH) {
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return -ENOENT;
+ how = LOOKUP_EMPTY;
+ }
+
+ if (flags & AT_SYMLINK_FOLLOW)
+ how |= LOOKUP_FOLLOW;
- error = user_path_at(olddfd, oldname,
- flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
- &old_path);
+ error = user_path_at(olddfd, oldname, how, &old_path);
if (error)
return error;
- error = user_path_parent(newdfd, newname, &nd, &to);
- if (error)
- goto out;
- error = -EXDEV;
- if (old_path.mnt != nd.path.mnt)
- goto out_release;
- new_dentry = lookup_create(&nd, 0);
+ new_dentry = user_path_create(newdfd, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
- goto out_unlock;
- error = mnt_want_write(nd.path.mnt);
+ goto out;
+
+ error = -EXDEV;
+ if (old_path.mnt != new_path.mnt)
+ goto out_dput;
+ error = mnt_want_write(new_path.mnt);
if (error)
goto out_dput;
- error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+ error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_drop_write;
- error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+ error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
out_drop_write:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(new_path.mnt);
out_dput:
dput(new_dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-out_release:
- path_put(&nd.path);
- putname(to);
+ mutex_unlock(&new_path.dentry->d_inode->i_mutex);
+ path_put(&new_path);
out:
path_put(&old_path);
* HOWEVER, it relies on the assumption that any object with ->lookup()
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
- * d) some filesystems don't support opened-but-unlinked directories,
- * either because of layout or because they are not ready to deal with
- * all cases correctly. The latter will be fixed (taking this sort of
- * stuff into VFS), but the former is not going away. Solution: the same
- * trick as in rmdir().
- * e) conversion from fhandle to dentry may come in the wrong moment - when
+ * d) conversion from fhandle to dentry may come in the wrong moment - when
* we are removing the target. Solution: we will have to grab ->i_mutex
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
* ->i_mutex on parents, which works but leads to some truly excessive
struct inode *new_dir, struct dentry *new_dentry)
{
int error = 0;
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
/*
* If we are going to change the parent - check write permissions,
if (error)
return error;
- target = new_dentry->d_inode;
+ dget(new_dentry);
if (target)
mutex_lock(&target->i_mutex);
- if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else {
- if (target)
- dentry_unhash(new_dentry);
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- }
+
+ error = -EBUSY;
+ if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+ goto out;
+
+ if (target)
+ shrink_dcache_parent(new_dentry);
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
if (target) {
- if (!error) {
- target->i_flags |= S_DEAD;
- dont_mount(new_dentry);
- }
- mutex_unlock(&target->i_mutex);
- if (d_unhashed(new_dentry))
- d_rehash(new_dentry);
- dput(new_dentry);
+ target->i_flags |= S_DEAD;
+ dont_mount(new_dentry);
}
+out:
+ if (target)
+ mutex_unlock(&target->i_mutex);
+ dput(new_dentry);
if (!error)
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
d_move(old_dentry,new_dentry);
static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
int error;
error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
return error;
dget(new_dentry);
- target = new_dentry->d_inode;
if (target)
mutex_lock(&target->i_mutex);
+
+ error = -EBUSY;
if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- if (!error) {
- if (target)
- dont_mount(new_dentry);
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
- }
+ goto out;
+
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
+ if (target)
+ dont_mount(new_dentry);
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+ d_move(old_dentry, new_dentry);
+out:
if (target)
mutex_unlock(&target->i_mutex);
dput(new_dentry);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path_parent);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);