a555d0a83fe95efa33a06248633869e89814e807
[linux-2.6.git] / fs / splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files and fixing the initial implementation
13  * bugs.
14  *
15  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
16  * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
17  *
18  */
19 #include <linux/fs.h>
20 #include <linux/file.h>
21 #include <linux/pagemap.h>
22 #include <linux/pipe_fs_i.h>
23 #include <linux/mm_inline.h>
24 #include <linux/swap.h>
25 #include <linux/module.h>
26
/*
 * Passed to the splice actors: per-call bookkeeping for one
 * move_from_pipe() invocation.
 */
struct splice_desc {
	unsigned int len, total_len;	/* len of current buffer chunk / bytes remaining overall */
	unsigned int flags;		/* SPLICE_F_* splice flags */
	struct file *file;		/* file to read/write */
	loff_t pos;			/* current file position */
};
36
/*
 * Try to steal the page backing @buf so it can be given away (e.g.
 * inserted into a destination address space) instead of copied.
 *
 * The page must be locked and uptodate on entry (WARN_ONs below).
 * On success the page has been removed from its mapping and from the
 * LRU, buf->stolen is set, and 0 is returned.  Returns 1 if the page
 * could not be removed from its mapping (still in use elsewhere), in
 * which case the caller must fall back to copying.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
                                     struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        WARN_ON(!PageLocked(page));
        WARN_ON(!PageUptodate(page));

        if (!remove_mapping(page_mapping(page), page))
                return 1;

        /*
         * Stolen pages must not stay on the LRU: take the zone lock and
         * pull the page off by hand.
         */
        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);

                spin_lock_irq(&zone->lru_lock);
                BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
                spin_unlock_irq(&zone->lru_lock);
        }

        buf->stolen = 1;
        return 0;
}
61
62 static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
63                                         struct pipe_buffer *buf)
64 {
65         page_cache_release(buf->page);
66         buf->page = NULL;
67         buf->stolen = 0;
68 }
69
70 static void *page_cache_pipe_buf_map(struct file *file,
71                                      struct pipe_inode_info *info,
72                                      struct pipe_buffer *buf)
73 {
74         struct page *page = buf->page;
75
76         lock_page(page);
77
78         if (!PageUptodate(page)) {
79                 unlock_page(page);
80                 return ERR_PTR(-EIO);
81         }
82
83         if (!page->mapping) {
84                 unlock_page(page);
85                 return ERR_PTR(-ENODATA);
86         }
87
88         return kmap(buf->page);
89 }
90
91 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
92                                       struct pipe_buffer *buf)
93 {
94         if (!buf->stolen)
95                 unlock_page(buf->page);
96         kunmap(buf->page);
97 }
98
/*
 * Buffer operations for pipe buffers whose pages originate from the
 * page cache.  can_merge is 0: page-cache pages are never appended to
 * in place by later pipe writes.
 */
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .can_merge = 0,
        .map = page_cache_pipe_buf_map,
        .unmap = page_cache_pipe_buf_unmap,
        .release = page_cache_pipe_buf_release,
        .steal = page_cache_pipe_buf_steal,
};
106
107 static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
108                             int nr_pages, unsigned long offset,
109                             unsigned long len, unsigned int flags)
110 {
111         struct pipe_inode_info *info;
112         int ret, do_wakeup, i;
113
114         ret = 0;
115         do_wakeup = 0;
116         i = 0;
117
118         mutex_lock(PIPE_MUTEX(*inode));
119
120         info = inode->i_pipe;
121         for (;;) {
122                 int bufs;
123
124                 if (!PIPE_READERS(*inode)) {
125                         send_sig(SIGPIPE, current, 0);
126                         if (!ret)
127                                 ret = -EPIPE;
128                         break;
129                 }
130
131                 bufs = info->nrbufs;
132                 if (bufs < PIPE_BUFFERS) {
133                         int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
134                         struct pipe_buffer *buf = info->bufs + newbuf;
135                         struct page *page = pages[i++];
136                         unsigned long this_len;
137
138                         this_len = PAGE_CACHE_SIZE - offset;
139                         if (this_len > len)
140                                 this_len = len;
141
142                         buf->page = page;
143                         buf->offset = offset;
144                         buf->len = this_len;
145                         buf->ops = &page_cache_pipe_buf_ops;
146                         info->nrbufs = ++bufs;
147                         do_wakeup = 1;
148
149                         ret += this_len;
150                         len -= this_len;
151                         offset = 0;
152                         if (!--nr_pages)
153                                 break;
154                         if (!len)
155                                 break;
156                         if (bufs < PIPE_BUFFERS)
157                                 continue;
158
159                         break;
160                 }
161
162                 if (flags & SPLICE_F_NONBLOCK) {
163                         if (!ret)
164                                 ret = -EAGAIN;
165                         break;
166                 }
167
168                 if (signal_pending(current)) {
169                         if (!ret)
170                                 ret = -ERESTARTSYS;
171                         break;
172                 }
173
174                 if (do_wakeup) {
175                         wake_up_interruptible_sync(PIPE_WAIT(*inode));
176                         kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
177                                     POLL_IN);
178                         do_wakeup = 0;
179                 }
180
181                 PIPE_WAITING_WRITERS(*inode)++;
182                 pipe_wait(inode);
183                 PIPE_WAITING_WRITERS(*inode)--;
184         }
185
186         mutex_unlock(PIPE_MUTEX(*inode));
187
188         if (do_wakeup) {
189                 wake_up_interruptible(PIPE_WAIT(*inode));
190                 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
191         }
192
193         while (i < nr_pages)
194                 page_cache_release(pages[i++]);
195
196         return ret;
197 }
198
199 static int __generic_file_splice_read(struct file *in, struct inode *pipe,
200                                       size_t len, unsigned int flags)
201 {
202         struct address_space *mapping = in->f_mapping;
203         unsigned int offset, nr_pages;
204         struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
205         struct page *page;
206         pgoff_t index, pidx;
207         int i, j;
208
209         index = in->f_pos >> PAGE_CACHE_SHIFT;
210         offset = in->f_pos & ~PAGE_CACHE_MASK;
211         nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
212
213         if (nr_pages > PIPE_BUFFERS)
214                 nr_pages = PIPE_BUFFERS;
215
216         /*
217          * initiate read-ahead on this page range
218          */
219         do_page_cache_readahead(mapping, in, index, nr_pages);
220
221         /*
222          * Get as many pages from the page cache as possible..
223          * Start IO on the page cache entries we create (we
224          * can assume that any pre-existing ones we find have
225          * already had IO started on them).
226          */
227         i = find_get_pages(mapping, index, nr_pages, pages);
228
229         /*
230          * common case - we found all pages and they are contiguous,
231          * kick them off
232          */
233         if (i && (pages[i - 1]->index == index + i - 1))
234                 goto splice_them;
235
236         /*
237          * fill shadow[] with pages at the right locations, so we only
238          * have to fill holes
239          */
240         memset(shadow, 0, nr_pages * sizeof(struct page *));
241         for (j = 0; j < i; j++)
242                 shadow[pages[j]->index - index] = pages[j];
243
244         /*
245          * now fill in the holes
246          */
247         for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
248                 int error;
249
250                 if (shadow[i])
251                         continue;
252
253                 /*
254                  * no page there, look one up / create it
255                  */
256                 page = find_or_create_page(mapping, pidx,
257                                                    mapping_gfp_mask(mapping));
258                 if (!page)
259                         break;
260
261                 if (PageUptodate(page))
262                         unlock_page(page);
263                 else {
264                         error = mapping->a_ops->readpage(in, page);
265
266                         if (unlikely(error)) {
267                                 page_cache_release(page);
268                                 break;
269                         }
270                 }
271                 shadow[i] = page;
272         }
273
274         if (!i) {
275                 for (i = 0; i < nr_pages; i++) {
276                          if (shadow[i])
277                                 page_cache_release(shadow[i]);
278                 }
279                 return 0;
280         }
281
282         memcpy(pages, shadow, i * sizeof(struct page *));
283
284         /*
285          * Now we splice them into the pipe..
286          */
287 splice_them:
288         return move_to_pipe(pipe, pages, i, offset, len, flags);
289 }
290
291 ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
292                                  size_t len, unsigned int flags)
293 {
294         ssize_t spliced;
295         int ret;
296
297         ret = 0;
298         spliced = 0;
299         while (len) {
300                 ret = __generic_file_splice_read(in, pipe, len, flags);
301
302                 if (ret <= 0)
303                         break;
304
305                 in->f_pos += ret;
306                 len -= ret;
307                 spliced += ret;
308
309                 if (!(flags & SPLICE_F_NONBLOCK))
310                         continue;
311                 ret = -EAGAIN;
312                 break;
313         }
314
315         if (spliced)
316                 return spliced;
317
318         return ret;
319 }
320
321 /*
322  * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
323  */
324 static int pipe_to_sendpage(struct pipe_inode_info *info,
325                             struct pipe_buffer *buf, struct splice_desc *sd)
326 {
327         struct file *file = sd->file;
328         loff_t pos = sd->pos;
329         unsigned int offset;
330         ssize_t ret;
331         void *ptr;
332
333         /*
334          * sub-optimal, but we are limited by the pipe ->map. we don't
335          * need a kmap'ed buffer here, we just want to make sure we
336          * have the page pinned if the pipe page originates from the
337          * page cache
338          */
339         ptr = buf->ops->map(file, info, buf);
340         if (IS_ERR(ptr))
341                 return PTR_ERR(ptr);
342
343         offset = pos & ~PAGE_CACHE_MASK;
344
345         ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
346                                         sd->len < sd->total_len);
347
348         buf->ops->unmap(info, buf);
349         if (ret == sd->len)
350                 return 0;
351
352         return -EIO;
353 }
354
355 /*
356  * This is a little more tricky than the file -> pipe splicing. There are
357  * basically three cases:
358  *
359  *      - Destination page already exists in the address space and there
360  *        are users of it. For that case we have no other option that
361  *        copying the data. Tough luck.
362  *      - Destination page already exists in the address space, but there
363  *        are no users of it. Make sure it's uptodate, then drop it. Fall
364  *        through to last case.
365  *      - Destination page does not exist, we can add the pipe page to
366  *        the page cache and avoid the copy.
367  *
368  * For now we just do the slower thing and always copy pages over, it's
369  * easier than migrating pages from the pipe to the target file. For the
370  * case of doing file | file splicing, the migrate approach had some LRU
371  * nastiness...
372  */
373 static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
374                         struct splice_desc *sd)
375 {
376         struct file *file = sd->file;
377         struct address_space *mapping = file->f_mapping;
378         unsigned int offset;
379         struct page *page;
380         pgoff_t index;
381         char *src;
382         int ret;
383
384         /*
385          * after this, page will be locked and unmapped
386          */
387         src = buf->ops->map(file, info, buf);
388         if (IS_ERR(src))
389                 return PTR_ERR(src);
390
391         index = sd->pos >> PAGE_CACHE_SHIFT;
392         offset = sd->pos & ~PAGE_CACHE_MASK;
393
394         /*
395          * reuse buf page, if SPLICE_F_MOVE is set
396          */
397         if (sd->flags & SPLICE_F_MOVE) {
398                 if (buf->ops->steal(info, buf))
399                         goto find_page;
400
401                 page = buf->page;
402                 if (add_to_page_cache_lru(page, mapping, index,
403                                                 mapping_gfp_mask(mapping)))
404                         goto find_page;
405         } else {
406 find_page:
407                 ret = -ENOMEM;
408                 page = find_or_create_page(mapping, index,
409                                                 mapping_gfp_mask(mapping));
410                 if (!page)
411                         goto out;
412
413                 /*
414                  * If the page is uptodate, it is also locked. If it isn't
415                  * uptodate, we can mark it uptodate if we are filling the
416                  * full page. Otherwise we need to read it in first...
417                  */
418                 if (!PageUptodate(page)) {
419                         if (sd->len < PAGE_CACHE_SIZE) {
420                                 ret = mapping->a_ops->readpage(file, page);
421                                 if (unlikely(ret))
422                                         goto out;
423
424                                 lock_page(page);
425
426                                 if (!PageUptodate(page)) {
427                                         /*
428                                          * page got invalidated, repeat
429                                          */
430                                         if (!page->mapping) {
431                                                 unlock_page(page);
432                                                 page_cache_release(page);
433                                                 goto find_page;
434                                         }
435                                         ret = -EIO;
436                                         goto out;
437                                 }
438                         } else {
439                                 WARN_ON(!PageLocked(page));
440                                 SetPageUptodate(page);
441                         }
442                 }
443         }
444
445         ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
446         if (ret)
447                 goto out;
448
449         if (!buf->stolen) {
450                 char *dst = kmap_atomic(page, KM_USER0);
451
452                 memcpy(dst + offset, src + buf->offset, sd->len);
453                 flush_dcache_page(page);
454                 kunmap_atomic(dst, KM_USER0);
455         }
456
457         ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
458         if (ret < 0)
459                 goto out;
460
461         set_page_dirty(page);
462         ret = write_one_page(page, 0);
463 out:
464         if (ret < 0)
465                 unlock_page(page);
466         if (!buf->stolen)
467                 page_cache_release(page);
468         buf->ops->unmap(info, buf);
469         return ret;
470 }
471
/*
 * Actor called by move_from_pipe() for each pipe buffer.  Consumes up to
 * sd->len bytes of the buffer; returns 0 on success or a negative errno
 * (-ENODATA is treated as a soft end-of-data by the caller).
 */
typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
                           struct splice_desc *);
474
/*
 * Drain the pipe backing @inode into @out by invoking @actor once per
 * buffer chunk, up to @len bytes total.  Implements the pipe-reader
 * protocol: consume buffers, release empty slots, wake waiting writers,
 * and sleep when the pipe is empty unless SPLICE_F_NONBLOCK is set.
 *
 * Returns the number of bytes consumed, or a negative errno if nothing
 * was consumed.  On return, out->f_pos is updated (under the inode
 * mutex) to the position the actor advanced to.
 */
static ssize_t move_from_pipe(struct inode *inode, struct file *out,
                              size_t len, unsigned int flags,
                              splice_actor *actor)
{
        struct pipe_inode_info *info;
        int ret, do_wakeup, err;
        struct splice_desc sd;

        ret = 0;
        do_wakeup = 0;

        sd.total_len = len;
        sd.flags = flags;
        sd.file = out;
        sd.pos = out->f_pos;

        mutex_lock(PIPE_MUTEX(*inode));

        info = inode->i_pipe;
        for (;;) {
                int bufs = info->nrbufs;

                if (bufs) {
                        int curbuf = info->curbuf;
                        struct pipe_buffer *buf = info->bufs + curbuf;
                        struct pipe_buf_operations *ops = buf->ops;

                        /* never feed the actor more than is left to do */
                        sd.len = buf->len;
                        if (sd.len > sd.total_len)
                                sd.len = sd.total_len;

                        err = actor(info, buf, &sd);
                        if (err) {
                                /* -ENODATA is not reported to the caller */
                                if (!ret && err != -ENODATA)
                                        ret = err;

                                break;
                        }

                        ret += sd.len;
                        buf->offset += sd.len;
                        buf->len -= sd.len;
                        /* buffer fully consumed: release it and free the slot */
                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(info, buf);
                                curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
                                info->curbuf = curbuf;
                                info->nrbufs = --bufs;
                                do_wakeup = 1;
                        }

                        sd.pos += sd.len;
                        sd.total_len -= sd.len;
                        if (!sd.total_len)
                                break;
                }

                if (bufs)
                        continue;
                /* pipe empty: stop if no writers remain */
                if (!PIPE_WRITERS(*inode))
                        break;
                /* no writer about to refill it and we have data - stop */
                if (!PIPE_WAITING_WRITERS(*inode)) {
                        if (ret)
                                break;
                }

                if (flags & SPLICE_F_NONBLOCK) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /* wake writers before sleeping so they can refill the pipe */
                if (do_wakeup) {
                        wake_up_interruptible_sync(PIPE_WAIT(*inode));
                        kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
                        do_wakeup = 0;
                }

                pipe_wait(inode);
        }

        mutex_unlock(PIPE_MUTEX(*inode));

        if (do_wakeup) {
                wake_up_interruptible(PIPE_WAIT(*inode));
                kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
        }

        mutex_lock(&out->f_mapping->host->i_mutex);
        out->f_pos = sd.pos;
        mutex_unlock(&out->f_mapping->host->i_mutex);
        return ret;

}
575
/*
 * Splice data from the pipe @inode to regular file @out via the
 * page-cache copying actor pipe_to_file().  Returns bytes written or a
 * negative errno.
 */
ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
                                  size_t len, unsigned int flags)
{
        return move_from_pipe(inode, out, len, flags, pipe_to_file);
}
581
/*
 * Splice data from the pipe @inode to a socket @out via the
 * ->sendpage() actor pipe_to_sendpage().  Returns bytes sent or a
 * negative errno.
 */
ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
                                size_t len, unsigned int flags)
{
        return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
}
587
/* exported for filesystems that wire these up as ->splice_read/write */
EXPORT_SYMBOL(generic_file_splice_write);
EXPORT_SYMBOL(generic_file_splice_read);
590
591 static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
592                            unsigned int flags)
593 {
594         loff_t pos;
595         int ret;
596
597         if (!out->f_op || !out->f_op->splice_write)
598                 return -EINVAL;
599
600         if (!(out->f_mode & FMODE_WRITE))
601                 return -EBADF;
602
603         pos = out->f_pos;
604         ret = rw_verify_area(WRITE, out, &pos, len);
605         if (unlikely(ret < 0))
606                 return ret;
607
608         return out->f_op->splice_write(pipe, out, len, flags);
609 }
610
611 static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
612                          unsigned int flags)
613 {
614         loff_t pos, isize, left;
615         int ret;
616
617         if (!in->f_op || !in->f_op->splice_read)
618                 return -EINVAL;
619
620         if (!(in->f_mode & FMODE_READ))
621                 return -EBADF;
622
623         pos = in->f_pos;
624         ret = rw_verify_area(READ, in, &pos, len);
625         if (unlikely(ret < 0))
626                 return ret;
627
628         isize = i_size_read(in->f_mapping->host);
629         if (unlikely(in->f_pos >= isize))
630                 return 0;
631         
632         left = isize - in->f_pos;
633         if (left < len)
634                 len = left;
635
636         return in->f_op->splice_read(in, pipe, len, flags);
637 }
638
639 static long do_splice(struct file *in, struct file *out, size_t len,
640                       unsigned int flags)
641 {
642         struct inode *pipe;
643
644         pipe = in->f_dentry->d_inode;
645         if (pipe->i_pipe)
646                 return do_splice_from(pipe, out, len, flags);
647
648         pipe = out->f_dentry->d_inode;
649         if (pipe->i_pipe)
650                 return do_splice_to(in, pipe, len, flags);
651
652         return -EINVAL;
653 }
654
655 asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
656 {
657         long error;
658         struct file *in, *out;
659         int fput_in, fput_out;
660
661         if (unlikely(!len))
662                 return 0;
663
664         error = -EBADF;
665         in = fget_light(fdin, &fput_in);
666         if (in) {
667                 if (in->f_mode & FMODE_READ) {
668                         out = fget_light(fdout, &fput_out);
669                         if (out) {
670                                 if (out->f_mode & FMODE_WRITE)
671                                         error = do_splice(in, out, len, flags);
672                                 fput_light(out, fput_out);
673                         }
674                 }
675
676                 fput_light(in, fput_in);
677         }
678
679         return error;
680 }