Revert "ext4: use old interface for ext4_readdir()"

[linux-3.10.git] / fs / namespace.c
diff --git a/fs/namespace.c b/fs/namespace.c

index 7563270a43abf11cbdaeb3a625528f3d90308a32..d0244c8ba09c8e8d53aba8dab6bcb367f2829b07 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -21,7 +21,8 @@
  #include <linux/fs_struct.h>   /* get_fs_root et.al. */
  #include <linux/fsnotify.h>    /* fsnotify_vfsmount_delete */
  #include <linux/uaccess.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
  #include "pnode.h"
  #include "internal.h"
  
@@ -826,6 +827,23 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
         }
  
         mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+       /* Don't allow unprivileged users to change mount flags */
+       if (flag & CL_UNPRIVILEGED) {
+               mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
+
+               if (mnt->mnt.mnt_flags & MNT_READONLY)
+                       mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
+               if (mnt->mnt.mnt_flags & MNT_NODEV)
+                       mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
+
+               if (mnt->mnt.mnt_flags & MNT_NOSUID)
+                       mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
+
+               if (mnt->mnt.mnt_flags & MNT_NOEXEC)
+                       mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
+       }
+
         atomic_inc(&sb->s_active);
         mnt->mnt.mnt_sb = sb;
         mnt->mnt.mnt_root = dget(root);
@@ -1121,11 +1139,21 @@ EXPORT_SYMBOL(may_umount);
  
  static LIST_HEAD(unmounted);   /* protected by namespace_sem */
  
-void release_mounts(struct list_head *head)
+static void namespace_unlock(void)
  {
         struct mount *mnt;
-       while (!list_empty(head)) {
-               mnt = list_first_entry(head, struct mount, mnt_hash);
+       LIST_HEAD(head);
+
+       if (likely(list_empty(&unmounted))) {
+               up_write(&namespace_sem);
+               return;
+       }
+
+       list_splice_init(&unmounted, &head);
+       up_write(&namespace_sem);
+
+       while (!list_empty(&head)) {
+               mnt = list_first_entry(&head, struct mount, mnt_hash);
                 list_del_init(&mnt->mnt_hash);
                 if (mnt_has_parent(mnt)) {
                         struct dentry *dentry;
@@ -1145,19 +1173,16 @@ void release_mounts(struct list_head *head)
         }
  }
  
-static void namespace_unlock(void)
+static inline void namespace_lock(void)
  {
-       LIST_HEAD(head);
-       list_splice_init(&unmounted, &head);
-       up_write(&namespace_sem);
-       release_mounts(&head);
+       down_write(&namespace_sem);     /* write side; dropped by namespace_unlock() */
  }
  
  /*
   * vfsmount lock must be held for write
   * namespace_sem must be held for write
   */
-void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
+void umount_tree(struct mount *mnt, int propagate)
  {
         LIST_HEAD(tmp_list);
         struct mount *p;
@@ -1181,7 +1206,7 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
                 }
                 change_mnt_propagation(p, MS_PRIVATE);
         }
-       list_splice(&tmp_list, kill);
+       list_splice(&tmp_list, &unmounted);
  }
  
  static void shrink_submounts(struct mount *mnt);
@@ -1190,7 +1215,6 @@ static int do_umount(struct mount *mnt, int flags)
  {
         struct super_block *sb = mnt->mnt.mnt_sb;
         int retval;
-       LIST_HEAD(umount_list);
  
         retval = security_sb_umount(&mnt->mnt, flags);
         if (retval)
@@ -1250,6 +1274,8 @@ static int do_umount(struct mount *mnt, int flags)
                  * Special case for "unmounting" root ...
                  * we just try to remount it readonly.
                  */
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
                 down_write(&sb->s_umount);
                 if (!(sb->s_flags & MS_RDONLY))
                         retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
@@ -1257,7 +1283,7 @@ static int do_umount(struct mount *mnt, int flags)
                 return retval;
         }
  
-       down_write(&namespace_sem);
+       namespace_lock();
         br_write_lock(&vfsmount_lock);
         event++;
  
@@ -1267,7 +1293,7 @@ static int do_umount(struct mount *mnt, int flags)
         retval = -EBUSY;
         if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
                 if (!list_empty(&mnt->mnt_list))
-                       umount_tree(mnt, 1, &unmounted);
+                       umount_tree(mnt, 1);
                 retval = 0;
         }
         br_write_unlock(&vfsmount_lock);
@@ -1316,6 +1342,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
                 goto dput_and_out;
         if (!check_mnt(mnt))
                 goto dput_and_out;
+       retval = -EPERM;
+       if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+               goto dput_and_out;
  
         retval = do_umount(mnt, flags);
  dput_and_out:
@@ -1344,13 +1373,13 @@ static bool mnt_ns_loop(struct path *path)
          * mount namespace loop?
          */
         struct inode *inode = path->dentry->d_inode;
-       struct proc_inode *ei;
+       struct proc_ns *ei;
         struct mnt_namespace *mnt_ns;
  
         if (!proc_ns_inode(inode))
                 return false;
  
-       ei = PROC_I(inode);
+       ei = get_proc_ns(inode);
         if (ei->ns_ops != &mntns_operations)
                 return false;
  
@@ -1401,11 +1430,9 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
         return res;
  out:
         if (res) {
-               LIST_HEAD(umount_list);
                 br_write_lock(&vfsmount_lock);
-               umount_tree(res, 0, &umount_list);
+               umount_tree(res, 0);
                 br_write_unlock(&vfsmount_lock);
-               release_mounts(&umount_list);
         }
         return q;
  }
@@ -1415,20 +1442,20 @@ out:
  struct vfsmount *collect_mounts(struct path *path)
  {
         struct mount *tree;
-       down_write(&namespace_sem);
+       namespace_lock();       /* same down_write(&namespace_sem) as before, via the helper */
         tree = copy_tree(real_mount(path->mnt), path->dentry,
                          CL_COPY_ALL | CL_PRIVATE);
-       up_write(&namespace_sem);
+       namespace_unlock();     /* drops namespace_sem; also drains 'unmounted' if non-empty */
         if (IS_ERR(tree))
-               return NULL;
+               return ERR_CAST(tree);  /* failure is now an ERR_PTR, not NULL: test with IS_ERR() */
         return &tree->mnt;
  }
  
  void drop_collected_mounts(struct vfsmount *mnt)
  {
-       down_write(&namespace_sem);
+       namespace_lock();       /* takes namespace_sem for write */
         br_write_lock(&vfsmount_lock);
-       umount_tree(real_mount(mnt), 0, &unmounted);
+       umount_tree(real_mount(mnt), 0);        /* victims land on 'unmounted'; namespace_unlock() drains it */
         br_write_unlock(&vfsmount_lock);
         namespace_unlock();
  }
@@ -1596,18 +1623,18 @@ retry:
                 mutex_unlock(&dentry->d_inode->i_mutex);
                 return ERR_PTR(-ENOENT);
         }
-       down_write(&namespace_sem);
+       namespace_lock();
         mnt = lookup_mnt(path);
         if (likely(!mnt)) {
                 struct mountpoint *mp = new_mountpoint(dentry);
                 if (IS_ERR(mp)) {
-                       up_write(&namespace_sem);
+                       namespace_unlock();
                         mutex_unlock(&dentry->d_inode->i_mutex);
                         return mp;
                 }
                 return mp;
         }
-       up_write(&namespace_sem);
+       namespace_unlock();
         mutex_unlock(&path->dentry->d_inode->i_mutex);
         path_put(path);
         path->mnt = mnt;
@@ -1619,7 +1646,7 @@ static void unlock_mount(struct mountpoint *where)
  {
         struct dentry *dentry = where->m_dentry;
         put_mountpoint(where);
-       up_write(&namespace_sem);
+       namespace_unlock();
         mutex_unlock(&dentry->d_inode->i_mutex);
  }
  
@@ -1670,7 +1697,7 @@ static int do_change_type(struct path *path, int flag)
         if (!type)
                 return -EINVAL;
  
-       down_write(&namespace_sem);
+       namespace_lock();
         if (type == MS_SHARED) {
                 err = invent_group_ids(mnt, recurse);
                 if (err)
@@ -1683,7 +1710,7 @@ static int do_change_type(struct path *path, int flag)
         br_write_unlock(&vfsmount_lock);
  
   out_unlock:
-       up_write(&namespace_sem);
+       namespace_unlock();
         return err;
  }
  
@@ -1693,7 +1720,6 @@ static int do_change_type(struct path *path, int flag)
  static int do_loopback(struct path *path, const char *old_name,
                                 int recurse)
  {
-       LIST_HEAD(umount_list);
         struct path old_path;
         struct mount *mnt = NULL, *old, *parent;
         struct mountpoint *mp;
@@ -1736,12 +1762,11 @@ static int do_loopback(struct path *path, const char *old_name,
         err = graft_tree(mnt, parent, mp);
         if (err) {
                 br_write_lock(&vfsmount_lock);
-               umount_tree(mnt, 0, &umount_list);
+               umount_tree(mnt, 0);
                 br_write_unlock(&vfsmount_lock);
         }
  out2:
         unlock_mount(mp);
-       release_mounts(&umount_list);
  out:
         path_put(&old_path);
         return err;
@@ -1782,6 +1807,39 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
         if (path->dentry != path->mnt->mnt_root)
                 return -EINVAL;
  
+       /* Don't allow changing of locked mnt flags.
+        *
+        * No locks need to be held here while testing the various
+        * MNT_LOCK flags because those flags can never be cleared
+        * once they are set.
+        */
+       if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+           !(mnt_flags & MNT_READONLY)) {
+               return -EPERM;
+       }
+       if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+           !(mnt_flags & MNT_NODEV)) {
+               /* Was the nodev implicitly added in mount? */
+               if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
+                   !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+                       mnt_flags |= MNT_NODEV;
+               } else {
+                       return -EPERM;
+               }
+       }
+       if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
+           !(mnt_flags & MNT_NOSUID)) {
+               return -EPERM;
+       }
+       if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
+           !(mnt_flags & MNT_NOEXEC)) {
+               return -EPERM;
+       }
+       if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+           ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
+               return -EPERM;
+       }
+
         err = security_sb_remount(sb, data);
         if (err)
                 return err;
@@ -1795,7 +1853,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
                 err = do_remount_sb(sb, flags, data, 0);
         if (!err) {
                 br_write_lock(&vfsmount_lock);
-               mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+               mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
                 mnt->mnt.mnt_flags = mnt_flags;
                 br_write_unlock(&vfsmount_lock);
         }
@@ -1981,7 +2039,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
                  */
                 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
                         flags |= MS_NODEV;
-                       mnt_flags |= MNT_NODEV;
+                       mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
                 }
         }
  
@@ -2021,11 +2079,11 @@ int finish_automount(struct vfsmount *m, struct path *path)
  fail:
         /* remove m from any expiration list it may be on */
         if (!list_empty(&mnt->mnt_expire)) {
-               down_write(&namespace_sem);
+               namespace_lock();
                 br_write_lock(&vfsmount_lock);
                 list_del_init(&mnt->mnt_expire);
                 br_write_unlock(&vfsmount_lock);
-               up_write(&namespace_sem);
+               namespace_unlock();
         }
         mntput(m);
         mntput(m);
@@ -2039,13 +2097,13 @@ fail:
   */
  void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
  {
-       down_write(&namespace_sem);
+       namespace_lock();       /* takes namespace_sem for write, like the down_write() it replaces */
         br_write_lock(&vfsmount_lock);
  
         list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
  
         br_write_unlock(&vfsmount_lock);
-       up_write(&namespace_sem);
+       namespace_unlock();     /* drops namespace_sem; also drains 'unmounted' if non-empty */
  }
  EXPORT_SYMBOL(mnt_set_expiry);
  
@@ -2062,7 +2120,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
         if (list_empty(mounts))
                 return;
  
-       down_write(&namespace_sem);
+       namespace_lock();
         br_write_lock(&vfsmount_lock);
  
         /* extract from the expiration list every vfsmount that matches the
@@ -2080,7 +2138,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
         while (!list_empty(&graveyard)) {
                 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
                 touch_mnt_namespace(mnt->mnt_ns);
-               umount_tree(mnt, 1, &unmounted);
+               umount_tree(mnt, 1);
         }
         br_write_unlock(&vfsmount_lock);
         namespace_unlock();
@@ -2151,7 +2209,7 @@ static void shrink_submounts(struct mount *mnt)
                         m = list_first_entry(&graveyard, struct mount,
                                                 mnt_expire);
                         touch_mnt_namespace(m->mnt_ns);
-                       umount_tree(m, 1, &unmounted);
+                       umount_tree(m, 1);
                 }
         }
  }
@@ -2274,12 +2332,11 @@ long do_mount(const char *dev_name, const char *dir_name,
  
         retval = security_sb_mount(dev_name, &path,
                                    type_page, flags, data_page);
+       if (!retval && !may_mount())
+               retval = -EPERM;
         if (retval)
                 goto dput_out;
  
-       if (!may_mount())
-               return -EPERM;
-
         /* Default to relatime unless overriden */
         if (!(flags & MS_NOATIME))
                 mnt_flags |= MNT_RELATIME;
@@ -2300,6 +2357,14 @@ long do_mount(const char *dev_name, const char *dir_name,
         if (flags & MS_RDONLY)
                 mnt_flags |= MNT_READONLY;
  
+       /* The default atime for remount is preservation */
+       if ((flags & MS_REMOUNT) &&
+           ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
+                      MS_STRICTATIME)) == 0)) {
+               mnt_flags &= ~MNT_ATIME_MASK;
+               mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
+       }
+
         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
                    MS_STRICTATIME);
@@ -2378,14 +2443,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
         if (IS_ERR(new_ns))
                 return new_ns;
  
-       down_write(&namespace_sem);
+       namespace_lock();
         /* First pass: copy the tree topology */
         copy_flags = CL_COPY_ALL | CL_EXPIRE;
         if (user_ns != mnt_ns->user_ns)
-               copy_flags |= CL_SHARED_TO_SLAVE;
+               copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
         new = copy_tree(old, old->mnt.mnt_root, copy_flags);
         if (IS_ERR(new)) {
-               up_write(&namespace_sem);
+               namespace_unlock();
                 free_mnt_ns(new_ns);
                 return ERR_CAST(new);
         }
@@ -2416,7 +2481,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                 p = next_mnt(p, old);
                 q = next_mnt(q, new);
         }
-       up_write(&namespace_sem);
+       namespace_unlock();
  
         if (rootmnt)
                 mntput(rootmnt);
@@ -2454,7 +2519,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
                 struct mount *mnt = real_mount(m);
                 mnt->mnt_ns = new_ns;
                 new_ns->root = mnt;
-               list_add(&new_ns->list, &mnt->mnt_list);
+               list_add(&mnt->mnt_list, &new_ns->list);
         } else {
                 mntput(m);
         }
@@ -2640,6 +2705,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
         /* make sure we can reach put_old from new_root */
         if (!is_path_reachable(old_mnt, old.dentry, &new))
                 goto out4;
+       /* make certain new is below the root */
+       if (!is_path_reachable(new_mnt, new.dentry, &root))
+               goto out4;
         root_mp->m_count++; /* pin it so it won't go away */
         br_write_lock(&vfsmount_lock);
         detach_mnt(new_mnt, &parent_path);
@@ -2738,9 +2806,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
  {
         if (!atomic_dec_and_test(&ns->count))
                 return;
-       down_write(&namespace_sem);
+       namespace_lock();
         br_write_lock(&vfsmount_lock);
-       umount_tree(ns->root, 0, &unmounted);
+       umount_tree(ns->root, 0);
         br_write_unlock(&vfsmount_lock);
         namespace_unlock();
         free_mnt_ns(ns);
@@ -2778,6 +2846,51 @@ bool our_mnt(struct vfsmount *mnt)
         return check_mnt(real_mount(mnt));
  }
  
+bool current_chrooted(void)
+{
+       /* True when current's fs root is not the root of its mount namespace */
+       struct path ns_root;
+       struct path fs_root;
+       bool chrooted;
+
+       /* Start at the namespace's root mount; step down while it is a mountpoint */
+       ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+       ns_root.dentry = ns_root.mnt->mnt_root;
+       path_get(&ns_root);     /* balanced by path_put(&ns_root) below */
+       while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+               ;       /* empty body: all the work is in the loop condition */
+
+       get_fs_root(current->fs, &fs_root);     /* released by path_put(&fs_root) below */
+
+       chrooted = !path_equal(&fs_root, &ns_root);
+
+       path_put(&fs_root);
+       path_put(&ns_root);
+
+       return chrooted;
+}
+
+void update_mnt_policy(struct user_namespace *userns)
+{
+       struct mnt_namespace *ns = current->nsproxy->mnt_ns;    /* the caller's mount namespace */
+       struct mount *mnt;
+
+       down_read(&namespace_sem);      /* read side only: the mount list is walked, not modified */
+       list_for_each_entry(mnt, &ns->list, mnt_list) {
+               switch (mnt->mnt.mnt_sb->s_magic) {     /* classify the mount by superblock magic */
+               case SYSFS_MAGIC:
+                       userns->may_mount_sysfs = true; /* a sysfs mount exists in this namespace */
+                       break;
+               case PROC_SUPER_MAGIC:
+                       userns->may_mount_proc = true;  /* a proc mount exists in this namespace */
+                       break;
+               }
+               if (userns->may_mount_sysfs && userns->may_mount_proc)
+                       break;  /* both flags set (they are never cleared here): stop scanning */
+       }
+       up_read(&namespace_sem);
+}
+
  static void *mntns_get(struct task_struct *task)
  {
         struct mnt_namespace *ns = NULL;