inotify: invalid mask should return a error number but not set it
[linux-2.6.git] / fs / pipe.c
index 054b8a6..1667e6f 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
 #include <linux/fs.h>
 #include <linux/log2.h>
 #include <linux/mount.h>
+#include <linux/magic.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
 /*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-size
+ */
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
+
+/*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
  * -- Florian Coosmann (FGC)
@@ -218,11 +231,12 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 {
        if (atomic) {
                buf->flags |= PIPE_BUF_FLAG_ATOMIC;
-               return kmap_atomic(buf->page, KM_USER0);
+               return kmap_atomic(buf->page);
        }
 
        return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -238,10 +252,11 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 {
        if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
                buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
-               kunmap_atomic(map_data, KM_USER0);
+               kunmap_atomic(map_data);
        } else
                kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -272,6 +287,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
        return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -287,6 +303,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
        page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -302,6 +319,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
        return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -316,6 +334,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
        page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .can_merge = 1,
@@ -327,6 +346,16 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
 };
 
+static const struct pipe_buf_operations packet_pipe_buf_ops = {
+       .can_merge = 0,
+       .map = generic_pipe_buf_map,
+       .unmap = generic_pipe_buf_unmap,
+       .confirm = generic_pipe_buf_confirm,
+       .release = anon_pipe_buf_release,
+       .steal = generic_pipe_buf_steal,
+       .get = generic_pipe_buf_get,
+};
+
 static ssize_t
 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
           unsigned long nr_segs, loff_t pos)
@@ -364,7 +393,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
                        error = ops->confirm(pipe, buf);
                        if (error) {
                                if (!ret)
-                                       error = ret;
+                                       ret = error;
                                break;
                        }
 
@@ -388,6 +417,13 @@ redo:
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;
+
+                       /* Was it a packet buffer? Clean up and exit */
+                       if (buf->flags & PIPE_BUF_FLAG_PACKET) {
+                               total_len = chars;
+                               buf->len = 0;
+                       }
+
                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(pipe, buf);
@@ -423,7 +459,7 @@ redo:
                        break;
                }
                if (do_wakeup) {
-                       wake_up_interruptible_sync(&pipe->wait);
+                       wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }
                pipe_wait(pipe);
@@ -432,7 +468,7 @@ redo:
 
        /* Signal writers asynchronously that there is more room. */
        if (do_wakeup) {
-               wake_up_interruptible_sync(&pipe->wait);
+               wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (ret > 0)
@@ -440,6 +476,11 @@ redo:
        return ret;
 }
 
+static inline int is_packetized(struct file *file)
+{
+       return (file->f_flags & O_DIRECT) != 0;
+}
+
 static ssize_t
 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
            unsigned long nr_segs, loff_t ppos)
@@ -547,14 +588,14 @@ redo1:
                        iov_fault_in_pages_read(iov, chars);
 redo2:
                        if (atomic)
-                               src = kmap_atomic(page, KM_USER0);
+                               src = kmap_atomic(page);
                        else
                                src = kmap(page);
 
                        error = pipe_iov_copy_from_user(src, iov, chars,
                                                        atomic);
                        if (atomic)
-                               kunmap_atomic(src, KM_USER0);
+                               kunmap_atomic(src);
                        else
                                kunmap(page);
 
@@ -574,6 +615,11 @@ redo2:
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = chars;
+                       buf->flags = 0;
+                       if (is_packetized(filp)) {
+                               buf->ops = &packet_pipe_buf_ops;
+                               buf->flags = PIPE_BUF_FLAG_PACKET;
+                       }
                        pipe->nrbufs = ++bufs;
                        pipe->tmp_page = NULL;
 
@@ -594,7 +640,7 @@ redo2:
                        break;
                }
                if (do_wakeup) {
-                       wake_up_interruptible_sync(&pipe->wait);
+                       wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                        do_wakeup = 0;
                }
@@ -605,7 +651,7 @@ redo2:
 out:
        mutex_unlock(&inode->i_mutex);
        if (do_wakeup) {
-               wake_up_interruptible_sync(&pipe->wait);
+               wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (ret > 0)
@@ -697,7 +743,7 @@ pipe_release(struct inode *inode, int decr, int decw)
        if (!pipe->readers && !pipe->writers) {
                free_pipe_info(inode);
        } else {
-               wake_up_interruptible_sync(&pipe->wait);
+               wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
@@ -814,6 +860,9 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
 {
        int ret = -ENOENT;
 
+       if (!(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
+               return -EINVAL;
+
        mutex_lock(&inode->i_mutex);
 
        if (inode->i_pipe) {
@@ -930,12 +979,14 @@ static const struct dentry_operations pipefs_dentry_operations = {
 
 static struct inode * get_pipe_inode(void)
 {
-       struct inode *inode = new_inode(pipe_mnt->mnt_sb);
+       struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;
 
        if (!inode)
                goto fail_inode;
 
+       inode->i_ino = get_next_ino();
+
        pipe = alloc_pipe_info(inode);
        if (!pipe)
                goto fail_iput;
@@ -979,12 +1030,11 @@ struct file *create_write_pipe(int flags)
                goto err;
 
        err = -ENOMEM;
-       path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
+       path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
        if (!path.dentry)
                goto err_inode;
        path.mnt = mntget(pipe_mnt);
 
-       path.dentry->d_op = &pipefs_dentry_operations;
        d_instantiate(path.dentry, inode);
 
        err = -ENFILE;
@@ -993,7 +1043,7 @@ struct file *create_write_pipe(int flags)
                goto err_dentry;
        f->f_mapping = inode->i_mapping;
 
-       f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
+       f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
        f->f_version = 0;
 
        return f;
@@ -1037,7 +1087,7 @@ int do_pipe_flags(int *fd, int flags)
        int error;
        int fdw, fdr;
 
-       if (flags & ~(O_CLOEXEC | O_NONBLOCK))
+       if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
                return -EINVAL;
 
        fw = create_write_pipe(flags);
@@ -1105,26 +1155,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
        struct pipe_buffer *bufs;
 
        /*
-        * Must be a power-of-2 currently
-        */
-       if (!is_power_of_2(arg))
-               return -EINVAL;
-
-       /*
         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
         * expect a lot of shrink+grow operations, just free and allocate
         * again like we would do for growing. If the pipe currently
         * contains more buffers than arg, then return busy.
         */
-       if (arg < pipe->nrbufs)
+       if (nr_pages < pipe->nrbufs)
                return -EBUSY;
 
-       bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+       bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;
 
@@ -1133,20 +1177,68 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
         * and adjust the indexes.
         */
        if (pipe->nrbufs) {
-               const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
-               const unsigned int head = pipe->nrbufs - tail;
+               unsigned int tail;
+               unsigned int head;
+
+               tail = pipe->curbuf + pipe->nrbufs;
+               if (tail < pipe->buffers)
+                       tail = 0;
+               else
+                       tail &= (pipe->buffers - 1);
 
+               head = pipe->nrbufs - tail;
                if (head)
                        memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
                if (tail)
-                       memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+                       memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
        }
 
        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
-       pipe->buffers = arg;
-       return arg;
+       pipe->buffers = nr_pages;
+       return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+       unsigned long nr_pages;
+
+       nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+                size_t *lenp, loff_t *ppos)
+{
+       int ret;
+
+       ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+       if (ret < 0 || !write)
+               return ret;
+
+       pipe_max_size = round_pipe_size(pipe_max_size);
+       return ret;
+}
+
+/*
+ * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
+ * location, so checking ->i_pipe is not enough to verify that this is a
+ * pipe.
+ */
+struct pipe_inode_info *get_pipe_info(struct file *file)
+{
+       struct inode *i = file->f_path.dentry->d_inode;
+
+       return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1154,44 +1246,64 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
        struct pipe_inode_info *pipe;
        long ret;
 
-       pipe = file->f_path.dentry->d_inode->i_pipe;
+       pipe = get_pipe_info(file);
        if (!pipe)
                return -EBADF;
 
        mutex_lock(&pipe->inode->i_mutex);
 
        switch (cmd) {
-       case F_SETPIPE_SZ:
-               ret = pipe_set_size(pipe, arg);
+       case F_SETPIPE_SZ: {
+               unsigned int size, nr_pages;
+
+               size = round_pipe_size(arg);
+               nr_pages = size >> PAGE_SHIFT;
+
+               ret = -EINVAL;
+               if (!nr_pages)
+                       goto out;
+
+               if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+                       ret = -EPERM;
+                       goto out;
+               }
+               ret = pipe_set_size(pipe, nr_pages);
                break;
+               }
        case F_GETPIPE_SZ:
-               ret = pipe->buffers;
+               ret = pipe->buffers * PAGE_SIZE;
                break;
        default:
                ret = -EINVAL;
                break;
        }
 
+out:
        mutex_unlock(&pipe->inode->i_mutex);
        return ret;
 }
 
+static const struct super_operations pipefs_ops = {
+       .destroy_inode = free_inode_nonrcu,
+       .statfs = simple_statfs,
+};
+
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-static int pipefs_get_sb(struct file_system_type *fs_type,
-                        int flags, const char *dev_name, void *data,
-                        struct vfsmount *mnt)
+static struct dentry *pipefs_mount(struct file_system_type *fs_type,
+                        int flags, const char *dev_name, void *data)
 {
-       return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
+       return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+                       &pipefs_dentry_operations, PIPEFS_MAGIC);
 }
 
 static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
-       .get_sb         = pipefs_get_sb,
+       .mount          = pipefs_mount,
        .kill_sb        = kill_anon_super,
 };
 
@@ -1209,11 +1321,4 @@ static int __init init_pipe_fs(void)
        return err;
 }
 
-static void __exit exit_pipe_fs(void)
-{
-       unregister_filesystem(&pipe_fs_type);
-       mntput(pipe_mnt);
-}
-
 fs_initcall(init_pipe_fs);
-module_exit(exit_pipe_fs);