[PATCH] splice: add direct fd <-> fd splicing support
Jens Axboe [Tue, 11 Apr 2006 11:52:07 +0000 (13:52 +0200)]
It's more efficient for sendfile() emulation. Basically we cache an
internal private pipe and just use that as the intermediate area for
pages. Direct splicing is not available from sys_splice(), it is only
meant to be used for sendfile() emulation.

Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at
exit for the normal fast path.

Signed-off-by: Jens Axboe <axboe@suse.de>

fs/pipe.c
fs/splice.c
include/linux/fs.h
include/linux/pipe_fs_i.h
include/linux/sched.h
kernel/exit.c

index 705b486..036536f 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -691,12 +691,10 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
        return info;
 }
 
-void free_pipe_info(struct inode *inode)
+void __free_pipe_info(struct pipe_inode_info *info)
 {
        int i;
-       struct pipe_inode_info *info = inode->i_pipe;
 
-       inode->i_pipe = NULL;
        for (i = 0; i < PIPE_BUFFERS; i++) {
                struct pipe_buffer *buf = info->bufs + i;
                if (buf->ops)
@@ -707,6 +705,12 @@ void free_pipe_info(struct inode *inode)
        kfree(info);
 }
 
+void free_pipe_info(struct inode *inode)
+{
+       __free_pipe_info(inode->i_pipe);
+       inode->i_pipe = NULL;
+}
+
 static struct vfsmount *pipe_mnt __read_mostly;
 static int pipefs_delete_dentry(struct dentry *dentry)
 {
index a532612..c47b561 100644 (file)
@@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage);
  * Attempt to initiate a splice from pipe to file.
  */
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-                          loff_t __user *off_out, size_t len,
-                          unsigned int flags)
+                          size_t len, unsigned int flags)
 {
        loff_t pos;
        int ret;
@@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (!(out->f_mode & FMODE_WRITE))
                return -EBADF;
 
-       if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t)))
-               return -EFAULT;
-
        pos = out->f_pos;
 
        ret = rw_verify_area(WRITE, out, &pos, len);
@@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, loff_t __user *off_in,
-                        struct pipe_inode_info *pipe, size_t len,
-                        unsigned int flags)
+static long do_splice_to(struct file *in, struct pipe_inode_info *pipe,
+                        size_t len, unsigned int flags)
 {
        loff_t pos, isize, left;
        int ret;
@@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in,
        if (!(in->f_mode & FMODE_READ))
                return -EBADF;
 
-       if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
-               return -EFAULT;
-
        pos = in->f_pos;
 
        ret = rw_verify_area(READ, in, &pos, len);
@@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in,
        return in->f_op->splice_read(in, pipe, len, flags);
 }
 
+long do_splice_direct(struct file *in, struct file *out, size_t len,
+                     unsigned int flags)
+{
+       struct pipe_inode_info *pipe;
+       long ret, bytes;
+       umode_t i_mode;
+       int i;
+
+       /*
+        * We require the input being a regular file, as we don't want to
+        * randomly drop data for eg socket -> socket splicing. Use the
+        * piped splicing for that!
+        */
+       i_mode = in->f_dentry->d_inode->i_mode;
+       if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
+               return -EINVAL;
+
+       /*
+        * neither in nor out is a pipe, setup an internal pipe attached to
+        * 'out' and transfer the wanted data from 'in' to 'out' through that
+        */
+       pipe = current->splice_pipe;
+       if (!pipe) {
+               pipe = alloc_pipe_info(NULL);
+               if (!pipe)
+                       return -ENOMEM;
+
+               /*
+                * We don't have an immediate reader, but we'll read the stuff
+                * out of the pipe right after the move_to_pipe(). So set
+                * PIPE_READERS appropriately.
+                */
+               pipe->readers = 1;
+
+               current->splice_pipe = pipe;
+       }
+
+       /*
+        * do the splice
+        */
+       ret = 0;
+       bytes = 0;
+
+       while (len) {
+               size_t read_len, max_read_len;
+
+               /*
+                * Do at most PIPE_BUFFERS pages worth of transfer:
+                */
+               max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
+
+               ret = do_splice_to(in, pipe, max_read_len, flags);
+               if (unlikely(ret < 0))
+                       goto out_release;
+
+               read_len = ret;
+
+               /*
+                * NOTE: nonblocking mode only applies to the input. We
+                * must not do the output in nonblocking mode as then we
+                * could get stuck data in the internal pipe:
+                */
+               ret = do_splice_from(pipe, out, read_len,
+                                    flags & ~SPLICE_F_NONBLOCK);
+               if (unlikely(ret < 0))
+                       goto out_release;
+
+               bytes += ret;
+               len -= ret;
+
+               /*
+                * In nonblocking mode, if we got back a short read then
+                * that was due to either an IO error or due to the
+                * pagecache entry not being there. In the IO error case
+                * the _next_ splice attempt will produce a clean IO error
+                * return value (not a short read), so in both cases it's
+                * correct to break out of the loop here:
+                */
+               if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
+                       break;
+       }
+
+       pipe->nrbufs = pipe->curbuf = 0;
+
+       return bytes;
+
+out_release:
+       /*
+        * If we did an incomplete transfer we must release
+        * the pipe buffers in question:
+        */
+       for (i = 0; i < PIPE_BUFFERS; i++) {
+               struct pipe_buffer *buf = pipe->bufs + i;
+
+               if (buf->ops) {
+                       buf->ops->release(pipe, buf);
+                       buf->ops = NULL;
+               }
+       }
+       pipe->nrbufs = pipe->curbuf = 0;
+
+       /*
+        * If we transferred some data, return the number of bytes:
+        */
+       if (bytes > 0)
+               return bytes;
+
+       return ret;
+}
+
+EXPORT_SYMBOL(do_splice_direct);
+
 /*
  * Determine where to splice to/from.
  */
@@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 {
        struct pipe_inode_info *pipe;
 
-       if (off_out && out->f_op->llseek == no_llseek)
-               return -EINVAL;
-       if (off_in && in->f_op->llseek == no_llseek)
-               return -EINVAL;
-
        pipe = in->f_dentry->d_inode->i_pipe;
        if (pipe) {
                if (off_in)
                        return -ESPIPE;
+               if (off_out) {
+                       if (out->f_op->llseek == no_llseek)
+                               return -EINVAL;
+                       if (copy_from_user(&out->f_pos, off_out,
+                                          sizeof(loff_t)))
+                               return -EFAULT;
+               }
 
-               return do_splice_from(pipe, out, off_out, len, flags);
+               return do_splice_from(pipe, out, len, flags);
        }
 
        pipe = out->f_dentry->d_inode->i_pipe;
        if (pipe) {
                if (off_out)
                        return -ESPIPE;
+               if (off_in) {
+                       if (in->f_op->llseek == no_llseek)
+                               return -EINVAL;
+                       if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
+                               return -EFAULT;
+               }
 
-               return do_splice_to(in, off_in, pipe, len, flags);
+               return do_splice_to(in, pipe, len, flags);
        }
 
        return -EINVAL;
index 7e64544..9e8e2ee 100644 (file)
@@ -1613,6 +1613,8 @@ extern void do_generic_mapping_read(struct address_space *mapping,
                                    loff_t *, read_descriptor_t *, read_actor_t);
 extern ssize_t generic_file_splice_read(struct file *, struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, size_t, unsigned int);
+extern long do_splice_direct(struct file *in, struct file *out,
+                            size_t len, unsigned int flags);
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
index 9cf99cb..660e9d8 100644 (file)
@@ -58,6 +58,7 @@ void pipe_wait(struct pipe_inode_info *pipe);
 
 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
 void free_pipe_info(struct inode * inode);
+void __free_pipe_info(struct pipe_inode_info *);
 
 /*
  * splice is tied to pipes as a transport (at least for now), so we'll just
index 541f482..e194ec7 100644 (file)
@@ -684,6 +684,7 @@ static inline void prefetch_stack(struct task_struct *t) { }
 
 struct audit_context;          /* See audit.c */
 struct mempolicy;
+struct pipe_inode_info;
 
 enum sleep_type {
        SLEEP_NORMAL,
@@ -882,6 +883,11 @@ struct task_struct {
 
        atomic_t fs_excl;       /* holding fs exclusive resources */
        struct rcu_head rcu;
+
+       /*
+        * cache last used pipe for splice
+        */
+       struct pipe_inode_info *splice_pipe;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
index 6c2eeb8..1a9787a 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -941,6 +942,9 @@ fastcall NORET_TYPE void do_exit(long code)
        if (tsk->io_context)
                exit_io_context();
 
+       if (tsk->splice_pipe)
+               __free_pipe_info(tsk->splice_pipe);
+
        /* PF_DEAD causes final put_task_struct after we schedule. */
        preempt_disable();
        BUG_ON(tsk->flags & PF_DEAD);