nfs4 use mandatory attribute file type in nfs4_get_root
[linux-2.6.git] / fs / namespace.c
index 22ae06a..88058de 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/mnt_namespace.h>
 #include <linux/namei.h>
+#include <linux/nsproxy.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/ramfs.h>
@@ -42,6 +43,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static int mnt_id_start = 0;
+static int mnt_group_start = 1;
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -69,7 +72,9 @@ static int mnt_alloc_id(struct vfsmount *mnt)
 retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);
        spin_lock(&vfsmount_lock);
-       res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
+       res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
+       if (!res)
+               mnt_id_start = mnt->mnt_id + 1;
        spin_unlock(&vfsmount_lock);
        if (res == -EAGAIN)
                goto retry;
@@ -79,8 +84,11 @@ retry:
 
 static void mnt_free_id(struct vfsmount *mnt)
 {
+       int id = mnt->mnt_id;
        spin_lock(&vfsmount_lock);
-       ida_remove(&mnt_id_ida, mnt->mnt_id);
+       ida_remove(&mnt_id_ida, id);
+       if (mnt_id_start > id)
+               mnt_id_start = id;
        spin_unlock(&vfsmount_lock);
 }
 
@@ -91,10 +99,18 @@ static void mnt_free_id(struct vfsmount *mnt)
  */
 static int mnt_alloc_group_id(struct vfsmount *mnt)
 {
+       int res;
+
        if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
                return -ENOMEM;
 
-       return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
+       res = ida_get_new_above(&mnt_group_ida,
+                               mnt_group_start,
+                               &mnt->mnt_group_id);
+       if (!res)
+               mnt_group_start = mnt->mnt_group_id + 1;
+
+       return res;
 }
 
 /*
@@ -102,7 +118,10 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)
  */
 void mnt_release_group_id(struct vfsmount *mnt)
 {
-       ida_remove(&mnt_group_ida, mnt->mnt_group_id);
+       int id = mnt->mnt_group_id;
+       ida_remove(&mnt_group_ida, id);
+       if (mnt_group_start > id)
+               mnt_group_start = id;
        mnt->mnt_group_id = 0;
 }
 
@@ -265,6 +284,47 @@ out:
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+       /* superblock may be r/o */
+       if (__mnt_is_readonly(mnt))
+               return -EROFS;
+       preempt_disable();
+       inc_mnt_writers(mnt);
+       preempt_enable();
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+               return mnt_want_write(file->f_path.mnt);
+       else
+               return mnt_clone_write(file->f_path.mnt);
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
+/**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
@@ -513,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                        mnt->mnt_master = old;
                        CLEAR_MNT_SHARED(mnt);
                } else if (!(flag & CL_PRIVATE)) {
-                       if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+                       if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                                list_add(&mnt->mnt_share, &old->mnt_share);
                        if (IS_MNT_SLAVE(old))
                                list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -568,7 +628,6 @@ repeat:
                mnt->mnt_pinned = 0;
                spin_unlock(&vfsmount_lock);
                acct_auto_close_mnt(mnt);
-               security_sb_umount_close(mnt);
                goto repeat;
        }
 }
@@ -677,6 +736,21 @@ static void m_stop(struct seq_file *m, void *v)
        up_read(&namespace_sem);
 }
 
+int mnt_had_events(struct proc_mounts *p)
+{
+       struct mnt_namespace *ns = p->ns;
+       int res = 0;
+
+       spin_lock(&vfsmount_lock);
+       if (p->event != ns->event) {
+               p->event = ns->event;
+               res = 1;
+       }
+       spin_unlock(&vfsmount_lock);
+
+       return res;
+}
+
 struct proc_fs_info {
        int flag;
        const char *str;
@@ -905,10 +979,12 @@ EXPORT_SYMBOL(may_umount_tree);
 int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
+       down_read(&namespace_sem);
        spin_lock(&vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
        spin_unlock(&vfsmount_lock);
+       up_read(&namespace_sem);
        return ret;
 }
 
@@ -1020,11 +1096,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
                 * we just try to remount it readonly.
                 */
                down_write(&sb->s_umount);
-               if (!(sb->s_flags & MS_RDONLY)) {
-                       lock_kernel();
+               if (!(sb->s_flags & MS_RDONLY))
                        retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-                       unlock_kernel();
-               }
                up_write(&sb->s_umount);
                return retval;
        }
@@ -1043,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
                retval = 0;
        }
        spin_unlock(&vfsmount_lock);
-       if (retval)
-               security_sb_umount_busy(mnt);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1062,8 +1133,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
        struct path path;
        int retval;
+       int lookup_flags = 0;
 
-       retval = user_path(name, &path);
+       if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+               return -EINVAL;
+
+       if (!(flags & UMOUNT_NOFOLLOW))
+               lookup_flags |= LOOKUP_FOLLOW;
+
+       retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
        if (retval)
                goto out;
        retval = -EINVAL;
@@ -1187,6 +1265,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
        release_mounts(&umount_list);
 }
 
+int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
+                  struct vfsmount *root)
+{
+       struct vfsmount *mnt;
+       int res = f(root, arg);
+       if (res)
+               return res;
+       list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
+               res = f(mnt, arg);
+               if (res)
+                       return res;
+       }
+       return 0;
+}
+
 static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
 {
        struct vfsmount *p;
@@ -1295,12 +1388,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
 
+       spin_lock(&vfsmount_lock);
+
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        }
-
-       spin_lock(&vfsmount_lock);
        if (parent_path) {
                detach_mnt(source_mnt, parent_path);
                attach_mnt(source_mnt, path);
@@ -1336,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 
        err = -ENOENT;
        mutex_lock(&path->dentry->d_inode->i_mutex);
-       if (IS_DEADDIR(path->dentry->d_inode))
+       if (cant_mount(path->dentry))
                goto out_unlock;
 
-       err = security_sb_check_sb(mnt, path);
-       if (err)
-               goto out_unlock;
-
-       err = -ENOENT;
-       if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+       if (!d_unlinked(path->dentry))
                err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
        mutex_unlock(&path->dentry->d_inode->i_mutex);
-       if (!err)
-               security_sb_post_addmount(mnt, path);
        return err;
 }
 
@@ -1477,12 +1563,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
                err = change_mount_flags(path->mnt, flags);
        else
                err = do_remount_sb(sb, flags, data, 0);
-       if (!err)
+       if (!err) {
+               spin_lock(&vfsmount_lock);
+               mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                path->mnt->mnt_flags = mnt_flags;
+               spin_unlock(&vfsmount_lock);
+       }
        up_write(&sb->s_umount);
        if (!err) {
-               security_sb_post_remount(path->mnt, flags, data);
-
                spin_lock(&vfsmount_lock);
                touch_mnt_namespace(path->mnt->mnt_ns);
                spin_unlock(&vfsmount_lock);
@@ -1523,10 +1611,10 @@ static int do_move_mount(struct path *path, char *old_name)
 
        err = -ENOENT;
        mutex_lock(&path->dentry->d_inode->i_mutex);
-       if (IS_DEADDIR(path->dentry->d_inode))
+       if (cant_mount(path->dentry))
                goto out1;
 
-       if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+       if (d_unlinked(path->dentry))
                goto out1;
 
        err = -EINVAL;
@@ -1583,14 +1671,16 @@ static int do_new_mount(struct path *path, char *type, int flags,
 {
        struct vfsmount *mnt;
 
-       if (!type || !memchr(type, 0, PAGE_SIZE))
+       if (!type)
                return -EINVAL;
 
        /* we need capabilities... */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       lock_kernel();
        mnt = do_kern_mount(type, flags, name, data);
+       unlock_kernel();
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
 
@@ -1606,6 +1696,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 {
        int err;
 
+       mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+
        down_write(&namespace_sem);
        /* Something was mounted here while we slept */
        while (d_mountpoint(path->dentry) &&
@@ -1812,6 +1904,23 @@ int copy_mount_options(const void __user * data, unsigned long *where)
        return 0;
 }
 
+int copy_mount_string(const void __user *data, char **where)
+{
+       char *tmp;
+
+       if (!data) {
+               *where = NULL;
+               return 0;
+       }
+
+       tmp = strndup_user(data, PAGE_SIZE);
+       if (IS_ERR(tmp))
+               return PTR_ERR(tmp);
+
+       *where = tmp;
+       return 0;
+}
+
 /*
  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1841,12 +1950,20 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 
        if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
                return -EINVAL;
-       if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
-               return -EINVAL;
 
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;
 
+       /* ... and get the mountpoint */
+       retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
+       if (retval)
+               return retval;
+
+       retval = security_sb_mount(dev_name, &path,
+                                  type_page, flags, data_page);
+       if (retval)
+               goto dput_out;
+
        /* Default to relatime unless overriden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;
@@ -1871,16 +1988,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
                   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
                   MS_STRICTATIME);
 
-       /* ... and get the mountpoint */
-       retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
-       if (retval)
-               return retval;
-
-       retval = security_sb_mount(dev_name, &path,
-                                  type_page, flags, data_page);
-       if (retval)
-               goto dput_out;
-
        if (flags & MS_REMOUNT)
                retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
                                    data_page);
@@ -1898,6 +2005,21 @@ dput_out:
        return retval;
 }
 
+static struct mnt_namespace *alloc_mnt_ns(void)
+{
+       struct mnt_namespace *new_ns;
+
+       new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       atomic_set(&new_ns->count, 1);
+       new_ns->root = NULL;
+       INIT_LIST_HEAD(&new_ns->list);
+       init_waitqueue_head(&new_ns->poll);
+       new_ns->event = 0;
+       return new_ns;
+}
+
 /*
  * Allocate a new namespace structure and populate it with contents
  * copied from the namespace of the passed in task structure.
@@ -1909,14 +2031,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct vfsmount *p, *q;
 
-       new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-       if (!new_ns)
-               return ERR_PTR(-ENOMEM);
-
-       atomic_set(&new_ns->count, 1);
-       INIT_LIST_HEAD(&new_ns->list);
-       init_waitqueue_head(&new_ns->poll);
-       new_ns->event = 0;
+       new_ns = alloc_mnt_ns();
+       if (IS_ERR(new_ns))
+               return new_ns;
 
        down_write(&namespace_sem);
        /* First pass: copy the tree topology */
@@ -1980,45 +2097,63 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
        return new_ns;
 }
 
+/**
+ * create_mnt_ns - creates a private namespace and adds a root filesystem
+ * @mnt: pointer to the new root filesystem mountpoint
+ */
+struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+{
+       struct mnt_namespace *new_ns;
+
+       new_ns = alloc_mnt_ns();
+       if (!IS_ERR(new_ns)) {
+               mnt->mnt_ns = new_ns;
+               new_ns->root = mnt;
+               list_add(&new_ns->list, &new_ns->root->mnt_list);
+       }
+       return new_ns;
+}
+EXPORT_SYMBOL(create_mnt_ns);
+
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
 {
-       int retval;
+       int ret;
+       char *kernel_type;
+       char *kernel_dir;
+       char *kernel_dev;
        unsigned long data_page;
-       unsigned long type_page;
-       unsigned long dev_page;
-       char *dir_page;
 
-       retval = copy_mount_options(type, &type_page);
-       if (retval < 0)
-               return retval;
+       ret = copy_mount_string(type, &kernel_type);
+       if (ret < 0)
+               goto out_type;
 
-       dir_page = getname(dir_name);
-       retval = PTR_ERR(dir_page);
-       if (IS_ERR(dir_page))
-               goto out1;
+       kernel_dir = getname(dir_name);
+       if (IS_ERR(kernel_dir)) {
+               ret = PTR_ERR(kernel_dir);
+               goto out_dir;
+       }
 
-       retval = copy_mount_options(dev_name, &dev_page);
-       if (retval < 0)
-               goto out2;
+       ret = copy_mount_string(dev_name, &kernel_dev);
+       if (ret < 0)
+               goto out_dev;
 
-       retval = copy_mount_options(data, &data_page);
-       if (retval < 0)
-               goto out3;
+       ret = copy_mount_options(data, &data_page);
+       if (ret < 0)
+               goto out_data;
 
-       lock_kernel();
-       retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
-                         flags, (void *)data_page);
-       unlock_kernel();
-       free_page(data_page);
+       ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+               (void *) data_page);
 
-out3:
-       free_page(dev_page);
-out2:
-       putname(dir_page);
-out1:
-       free_page(type_page);
-       return retval;
+       free_page(data_page);
+out_data:
+       kfree(kernel_dev);
+out_dev:
+       putname(kernel_dir);
+out_dir:
+       kfree(kernel_type);
+out_type:
+       return ret;
 }
 
 /*
@@ -2087,11 +2222,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        if (!check_mnt(root.mnt))
                goto out2;
        error = -ENOENT;
-       if (IS_DEADDIR(new.dentry->d_inode))
+       if (cant_mount(old.dentry))
                goto out2;
-       if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+       if (d_unlinked(new.dentry))
                goto out2;
-       if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+       if (d_unlinked(old.dentry))
                goto out2;
        error = -EBUSY;
        if (new.mnt == root.mnt ||
@@ -2130,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        spin_unlock(&vfsmount_lock);
        chroot_fs_refs(&root, &new);
-       security_sb_post_pivotroot(&root, &new);
        error = 0;
        path_put(&root_parent);
        path_put(&parent_path);
@@ -2157,16 +2291,9 @@ static void __init init_mount_tree(void)
        mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");
-       ns = kmalloc(sizeof(*ns), GFP_KERNEL);
-       if (!ns)
+       ns = create_mnt_ns(mnt);
+       if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
-       atomic_set(&ns->count, 1);
-       INIT_LIST_HEAD(&ns->list);
-       init_waitqueue_head(&ns->poll);
-       ns->event = 0;
-       list_add(&mnt->mnt_list, &ns->list);
-       ns->root = mnt;
-       mnt->mnt_ns = ns;
 
        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);
@@ -2209,17 +2336,18 @@ void __init mnt_init(void)
        init_mount_tree();
 }
 
-void __put_mnt_ns(struct mnt_namespace *ns)
+void put_mnt_ns(struct mnt_namespace *ns)
 {
-       struct vfsmount *root = ns->root;
        LIST_HEAD(umount_list);
-       ns->root = NULL;
-       spin_unlock(&vfsmount_lock);
+
+       if (!atomic_dec_and_test(&ns->count))
+               return;
        down_write(&namespace_sem);
        spin_lock(&vfsmount_lock);
-       umount_tree(root, 0, &umount_list);
+       umount_tree(ns->root, 0, &umount_list);
        spin_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);
 }
+EXPORT_SYMBOL(put_mnt_ns);