[PATCH] Introduce sys_splice() system call
[linux-2.6.git] / fs / splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files and fixing the initial implementation
13  * bugs.
14  *
15  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
16  * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
17  *
18  */
19 #include <linux/fs.h>
20 #include <linux/file.h>
21 #include <linux/pagemap.h>
22 #include <linux/pipe_fs_i.h>
23 #include <linux/mm_inline.h>
24
25 /*
26  * Passed to the actors
27  */
/*
 * Work description handed to a splice actor for each pipe buffer it
 * is asked to consume (see pipe_to_file/pipe_to_sendpage).
 */
struct splice_desc {
        unsigned int len, total_len;    /* current and remaining length */
        unsigned int flags;             /* splice flags */
        struct file *file;              /* file to read/write */
        loff_t pos;                     /* file position */
};
34
35 static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
36                                         struct pipe_buffer *buf)
37 {
38         page_cache_release(buf->page);
39         buf->page = NULL;
40 }
41
42 static void *page_cache_pipe_buf_map(struct file *file,
43                                      struct pipe_inode_info *info,
44                                      struct pipe_buffer *buf)
45 {
46         struct page *page = buf->page;
47
48         lock_page(page);
49
50         if (!PageUptodate(page)) {
51                 unlock_page(page);
52                 return ERR_PTR(-EIO);
53         }
54
55         if (!page->mapping) {
56                 unlock_page(page);
57                 return ERR_PTR(-ENODATA);
58         }
59
60         return kmap(buf->page);
61 }
62
63 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
64                                       struct pipe_buffer *buf)
65 {
66         unlock_page(buf->page);
67         kunmap(buf->page);
68 }
69
/*
 * Operations for pipe buffers whose pages come from the page cache.
 * can_merge = 0: presumably new data must not be appended in place to
 * these buffers — semantics defined by the pipe core; verify against
 * pipe_fs_i.h.
 */
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .can_merge = 0,
        .map = page_cache_pipe_buf_map,
        .unmap = page_cache_pipe_buf_unmap,
        .release = page_cache_pipe_buf_release,
};
76
77 static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
78                             int nr_pages, unsigned long offset,
79                             unsigned long len)
80 {
81         struct pipe_inode_info *info;
82         int ret, do_wakeup, i;
83
84         ret = 0;
85         do_wakeup = 0;
86         i = 0;
87
88         mutex_lock(PIPE_MUTEX(*inode));
89
90         info = inode->i_pipe;
91         for (;;) {
92                 int bufs;
93
94                 if (!PIPE_READERS(*inode)) {
95                         send_sig(SIGPIPE, current, 0);
96                         if (!ret)
97                                 ret = -EPIPE;
98                         break;
99                 }
100
101                 bufs = info->nrbufs;
102                 if (bufs < PIPE_BUFFERS) {
103                         int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
104                         struct pipe_buffer *buf = info->bufs + newbuf;
105                         struct page *page = pages[i++];
106                         unsigned long this_len;
107
108                         this_len = PAGE_CACHE_SIZE - offset;
109                         if (this_len > len)
110                                 this_len = len;
111
112                         buf->page = page;
113                         buf->offset = offset;
114                         buf->len = this_len;
115                         buf->ops = &page_cache_pipe_buf_ops;
116                         info->nrbufs = ++bufs;
117                         do_wakeup = 1;
118
119                         ret += this_len;
120                         len -= this_len;
121                         offset = 0;
122                         if (!--nr_pages)
123                                 break;
124                         if (!len)
125                                 break;
126                         if (bufs < PIPE_BUFFERS)
127                                 continue;
128
129                         break;
130                 }
131
132                 if (signal_pending(current)) {
133                         if (!ret)
134                                 ret = -ERESTARTSYS;
135                         break;
136                 }
137
138                 if (do_wakeup) {
139                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
140                         kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
141                                     POLL_IN);
142                         do_wakeup = 0;
143                 }
144
145                 PIPE_WAITING_WRITERS(*inode)++;
146                 pipe_wait(inode);
147                 PIPE_WAITING_WRITERS(*inode)--;
148         }
149
150         mutex_unlock(PIPE_MUTEX(*inode));
151
152         if (do_wakeup) {
153                 wake_up_interruptible(PIPE_WAIT(*inode));
154                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
155         }
156
157         while (i < nr_pages)
158                 page_cache_release(pages[i++]);
159
160         return ret;
161 }
162
163 static int __generic_file_splice_read(struct file *in, struct inode *pipe,
164                                       size_t len)
165 {
166         struct address_space *mapping = in->f_mapping;
167         unsigned int offset, nr_pages;
168         struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
169         struct page *page;
170         pgoff_t index, pidx;
171         int i, j;
172
173         index = in->f_pos >> PAGE_CACHE_SHIFT;
174         offset = in->f_pos & ~PAGE_CACHE_MASK;
175         nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
176
177         if (nr_pages > PIPE_BUFFERS)
178                 nr_pages = PIPE_BUFFERS;
179
180         /*
181          * initiate read-ahead on this page range
182          */
183         do_page_cache_readahead(mapping, in, index, nr_pages);
184
185         /*
186          * Get as many pages from the page cache as possible..
187          * Start IO on the page cache entries we create (we
188          * can assume that any pre-existing ones we find have
189          * already had IO started on them).
190          */
191         i = find_get_pages(mapping, index, nr_pages, pages);
192
193         /*
194          * common case - we found all pages and they are contiguous,
195          * kick them off
196          */
197         if (i && (pages[i - 1]->index == index + i - 1))
198                 goto splice_them;
199
200         /*
201          * fill shadow[] with pages at the right locations, so we only
202          * have to fill holes
203          */
204         memset(shadow, 0, i * sizeof(struct page *));
205         for (j = 0, pidx = index; j < i; pidx++, j++)
206                 shadow[pages[j]->index - pidx] = pages[j];
207
208         /*
209          * now fill in the holes
210          */
211         for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
212                 int error;
213
214                 if (shadow[i])
215                         continue;
216
217                 /*
218                  * no page there, look one up / create it
219                  */
220                 page = find_or_create_page(mapping, pidx,
221                                                    mapping_gfp_mask(mapping));
222                 if (!page)
223                         break;
224
225                 if (PageUptodate(page))
226                         unlock_page(page);
227                 else {
228                         error = mapping->a_ops->readpage(in, page);
229
230                         if (unlikely(error)) {
231                                 page_cache_release(page);
232                                 break;
233                         }
234                 }
235                 shadow[i] = page;
236         }
237
238         if (!i) {
239                 for (i = 0; i < nr_pages; i++) {
240                          if (shadow[i])
241                                 page_cache_release(shadow[i]);
242                 }
243                 return 0;
244         }
245
246         memcpy(pages, shadow, i * sizeof(struct page *));
247
248         /*
249          * Now we splice them into the pipe..
250          */
251 splice_them:
252         return move_to_pipe(pipe, pages, i, offset, len);
253 }
254
255 ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
256                                  size_t len, unsigned int flags)
257 {
258         ssize_t spliced;
259         int ret;
260
261         ret = 0;
262         spliced = 0;
263         while (len) {
264                 ret = __generic_file_splice_read(in, pipe, len);
265
266                 if (ret <= 0)
267                         break;
268
269                 in->f_pos += ret;
270                 len -= ret;
271                 spliced += ret;
272         }
273
274         if (spliced)
275                 return spliced;
276
277         return ret;
278 }
279
280 /*
281  * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
282  */
283 static int pipe_to_sendpage(struct pipe_inode_info *info,
284                             struct pipe_buffer *buf, struct splice_desc *sd)
285 {
286         struct file *file = sd->file;
287         loff_t pos = sd->pos;
288         unsigned int offset;
289         ssize_t ret;
290         void *ptr;
291
292         /*
293          * sub-optimal, but we are limited by the pipe ->map. we don't
294          * need a kmap'ed buffer here, we just want to make sure we
295          * have the page pinned if the pipe page originates from the
296          * page cache
297          */
298         ptr = buf->ops->map(file, info, buf);
299         if (IS_ERR(ptr))
300                 return PTR_ERR(ptr);
301
302         offset = pos & ~PAGE_CACHE_MASK;
303
304         ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
305                                         sd->len < sd->total_len);
306
307         buf->ops->unmap(info, buf);
308         if (ret == sd->len)
309                 return 0;
310
311         return -EIO;
312 }
313
314 /*
315  * This is a little more tricky than the file -> pipe splicing. There are
316  * basically three cases:
317  *
318  *      - Destination page already exists in the address space and there
319  *        are users of it. For that case we have no other option that
320  *        copying the data. Tough luck.
321  *      - Destination page already exists in the address space, but there
322  *        are no users of it. Make sure it's uptodate, then drop it. Fall
323  *        through to last case.
324  *      - Destination page does not exist, we can add the pipe page to
325  *        the page cache and avoid the copy.
326  *
327  * For now we just do the slower thing and always copy pages over, it's
328  * easier than migrating pages from the pipe to the target file. For the
329  * case of doing file | file splicing, the migrate approach had some LRU
330  * nastiness...
331  */
332 static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
333                         struct splice_desc *sd)
334 {
335         struct file *file = sd->file;
336         struct address_space *mapping = file->f_mapping;
337         unsigned int offset;
338         struct page *page;
339         char *src, *dst;
340         pgoff_t index;
341         int ret;
342
343         /*
344          * after this, page will be locked and unmapped
345          */
346         src = buf->ops->map(file, info, buf);
347         if (IS_ERR(src))
348                 return PTR_ERR(src);
349
350         index = sd->pos >> PAGE_CACHE_SHIFT;
351         offset = sd->pos & ~PAGE_CACHE_MASK;
352
353 find_page:
354         ret = -ENOMEM;
355         page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
356         if (!page)
357                 goto out;
358
359         /*
360          * If the page is uptodate, it is also locked. If it isn't
361          * uptodate, we can mark it uptodate if we are filling the
362          * full page. Otherwise we need to read it in first...
363          */
364         if (!PageUptodate(page)) {
365                 if (sd->len < PAGE_CACHE_SIZE) {
366                         ret = mapping->a_ops->readpage(file, page);
367                         if (unlikely(ret))
368                                 goto out;
369
370                         lock_page(page);
371
372                         if (!PageUptodate(page)) {
373                                 /*
374                                  * page got invalidated, repeat
375                                  */
376                                 if (!page->mapping) {
377                                         unlock_page(page);
378                                         page_cache_release(page);
379                                         goto find_page;
380                                 }
381                                 ret = -EIO;
382                                 goto out;
383                         }
384                 } else {
385                         WARN_ON(!PageLocked(page));
386                         SetPageUptodate(page);
387                 }
388         }
389
390         ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
391         if (ret)
392                 goto out;
393
394         dst = kmap_atomic(page, KM_USER0);
395         memcpy(dst + offset, src + buf->offset, sd->len);
396         flush_dcache_page(page);
397         kunmap_atomic(dst, KM_USER0);
398
399         ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
400         if (ret < 0)
401                 goto out;
402
403         set_page_dirty(page);
404         ret = write_one_page(page, 0);
405 out:
406         if (ret < 0)
407                 unlock_page(page);
408         page_cache_release(page);
409         buf->ops->unmap(info, buf);
410         return ret;
411 }
412
/*
 * An actor consumes up to sd->len bytes of one pipe buffer; returns 0
 * on success, nonzero (typically a negative errno) on failure.
 */
typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
                           struct splice_desc *);
415
/*
 * Drain the pipe behind 'inode' into 'out': repeatedly hand the head
 * pipe buffer to 'actor' until 'len' bytes are consumed, the pipe
 * empties with no writers left, a signal is pending, or the actor
 * fails. Returns bytes consumed, or a negative errno when nothing was
 * consumed.
 */
static ssize_t move_from_pipe(struct inode *inode, struct file *out,
                              size_t len, unsigned int flags,
                              splice_actor *actor)
{
        struct pipe_inode_info *info;
        int ret, do_wakeup, err;
        struct splice_desc sd;

        ret = 0;
        do_wakeup = 0;

        sd.total_len = len;
        sd.flags = flags;
        sd.file = out;
        sd.pos = out->f_pos;

        mutex_lock(PIPE_MUTEX(*inode));

        info = inode->i_pipe;
        for (;;) {
                int bufs = info->nrbufs;

                if (bufs) {
                        int curbuf = info->curbuf;
                        struct pipe_buffer *buf = info->bufs + curbuf;
                        struct pipe_buf_operations *ops = buf->ops;

                        /* never consume more than the caller asked for */
                        sd.len = buf->len;
                        if (sd.len > sd.total_len)
                                sd.len = sd.total_len;

                        err = actor(info, buf, &sd);
                        if (err) {
                                /*
                                 * -ENODATA from ->map() means the page
                                 * vanished under us (truncation); don't
                                 * overwrite data we already moved.
                                 */
                                if (!ret && err != -ENODATA)
                                        ret = err;

                                break;
                        }

                        ret += sd.len;
                        buf->offset += sd.len;
                        buf->len -= sd.len;
                        /* buffer fully consumed: release it and advance */
                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(info, buf);
                                curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
                                info->curbuf = curbuf;
                                info->nrbufs = --bufs;
                                do_wakeup = 1;
                        }

                        sd.pos += sd.len;
                        sd.total_len -= sd.len;
                        if (!sd.total_len)
                                break;
                }

                /* more buffered data to push out before considering sleep */
                if (bufs)
                        continue;
                if (!PIPE_WRITERS(*inode))
                        break;
                if (!PIPE_WAITING_WRITERS(*inode)) {
                        if (ret)
                                break;
                }

                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /* we freed pipe slots: wake blocked writers before sleeping */
                if (do_wakeup) {
                        wake_up_interruptible_sync(PIPE_WAIT(*inode));
                        kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
                        do_wakeup = 0;
                }

                pipe_wait(inode);
        }

        mutex_unlock(PIPE_MUTEX(*inode));

        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
        }

        /*
         * Publish the new output file position under the inode mutex.
         * NOTE(review): f_pos was sampled above WITHOUT this mutex, so
         * a concurrent writer's position update could be clobbered -
         * confirm callers serialize access to 'out'.
         */
        mutex_lock(&out->f_mapping->host->i_mutex);
        out->f_pos = sd.pos;
        mutex_unlock(&out->f_mapping->host->i_mutex);
        return ret;

}
510
511 ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
512                                   size_t len, unsigned int flags)
513 {
514         return move_from_pipe(inode, out, len, flags, pipe_to_file);
515 }
516
517 ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
518                                 size_t len, unsigned int flags)
519 {
520         return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
521 }
522
523 static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
524                            unsigned int flags)
525 {
526         loff_t pos;
527         int ret;
528
529         if (!out->f_op || !out->f_op->splice_write)
530                 return -EINVAL;
531
532         if (!(out->f_mode & FMODE_WRITE))
533                 return -EBADF;
534
535         pos = out->f_pos;
536         ret = rw_verify_area(WRITE, out, &pos, len);
537         if (unlikely(ret < 0))
538                 return ret;
539
540         return out->f_op->splice_write(pipe, out, len, flags);
541 }
542
543 static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
544                          unsigned int flags)
545 {
546         loff_t pos, isize, left;
547         int ret;
548
549         if (!in->f_op || !in->f_op->splice_read)
550                 return -EINVAL;
551
552         if (!(in->f_mode & FMODE_READ))
553                 return -EBADF;
554
555         pos = in->f_pos;
556         ret = rw_verify_area(READ, in, &pos, len);
557         if (unlikely(ret < 0))
558                 return ret;
559
560         isize = i_size_read(in->f_mapping->host);
561         if (unlikely(in->f_pos >= isize))
562                 return 0;
563         
564         left = isize - in->f_pos;
565         if (left < len)
566                 len = left;
567
568         return in->f_op->splice_read(in, pipe, len, flags);
569 }
570
571 static long do_splice(struct file *in, struct file *out, size_t len,
572                       unsigned int flags)
573 {
574         struct inode *pipe;
575
576         pipe = in->f_dentry->d_inode;
577         if (pipe->i_pipe)
578                 return do_splice_from(pipe, out, len, flags);
579
580         pipe = out->f_dentry->d_inode;
581         if (pipe->i_pipe)
582                 return do_splice_to(in, pipe, len, flags);
583
584         return -EINVAL;
585 }
586
587 asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
588 {
589         long error;
590         struct file *in, *out;
591         int fput_in, fput_out;
592
593         if (unlikely(!len))
594                 return 0;
595
596         error = -EBADF;
597         in = fget_light(fdin, &fput_in);
598         if (in) {
599                 if (in->f_mode & FMODE_READ) {
600                         out = fget_light(fdout, &fput_out);
601                         if (out) {
602                                 if (out->f_mode & FMODE_WRITE)
603                                         error = do_splice(in, out, len, flags);
604                                 fput_light(out, fput_out);
605                         }
606                 }
607
608                 fput_light(in, fput_in);
609         }
610
611         return error;
612 }