[PATCH] pipe: enable atomic copying of pipe data to/from user space
[linux-2.6.git] / fs / pipe.c
1 /*
2  *  linux/fs/pipe.c
3  *
4  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
5  */
6
7 #include <linux/mm.h>
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/slab.h>
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/fs.h>
14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h>
17 #include <linux/highmem.h>
18 #include <linux/pagemap.h>
19
20 #include <asm/uaccess.h>
21 #include <asm/ioctls.h>
22
23 /*
24  * We use a start+len construction, which provides full use of the 
25  * allocated memory.
26  * -- Florian Coosmann (FGC)
27  * 
28  * Reads with count = 0 should always return 0.
29  * -- Julian Bradfield 1999-06-07.
30  *
31  * FIFOs and Pipes now generate SIGIO for both readers and writers.
32  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
33  *
34  * pipe_read & write cleanup
35  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
36  */
37
38 /* Drop the inode semaphore and wait for a pipe event, atomically */
39 void pipe_wait(struct pipe_inode_info *pipe)
40 {
41         DEFINE_WAIT(wait);
42
43         /*
44          * Pipes are system-local resources, so sleeping on them
45          * is considered a noninteractive wait:
46          */
47         prepare_to_wait(&pipe->wait, &wait,
48                         TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
49         if (pipe->inode)
50                 mutex_unlock(&pipe->inode->i_mutex);
51         schedule();
52         finish_wait(&pipe->wait, &wait);
53         if (pipe->inode)
54                 mutex_lock(&pipe->inode->i_mutex);
55 }
56
57 static int
58 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
59                         int atomic)
60 {
61         unsigned long copy;
62
63         while (len > 0) {
64                 while (!iov->iov_len)
65                         iov++;
66                 copy = min_t(unsigned long, len, iov->iov_len);
67
68                 if (atomic) {
69                         if (__copy_from_user_inatomic(to, iov->iov_base, copy))
70                                 return -EFAULT;
71                 } else {
72                         if (copy_from_user(to, iov->iov_base, copy))
73                                 return -EFAULT;
74                 }
75                 to += copy;
76                 len -= copy;
77                 iov->iov_base += copy;
78                 iov->iov_len -= copy;
79         }
80         return 0;
81 }
82
83 static int
84 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
85                       int atomic)
86 {
87         unsigned long copy;
88
89         while (len > 0) {
90                 while (!iov->iov_len)
91                         iov++;
92                 copy = min_t(unsigned long, len, iov->iov_len);
93
94                 if (atomic) {
95                         if (__copy_to_user_inatomic(iov->iov_base, from, copy))
96                                 return -EFAULT;
97                 } else {
98                         if (copy_to_user(iov->iov_base, from, copy))
99                                 return -EFAULT;
100                 }
101                 from += copy;
102                 len -= copy;
103                 iov->iov_base += copy;
104                 iov->iov_len -= copy;
105         }
106         return 0;
107 }
108
109 /*
110  * Attempt to pre-fault in the user memory, so we can use atomic copies.
111  * Returns the number of bytes not faulted in.
112  */
113 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
114 {
115         while (!iov->iov_len)
116                 iov++;
117
118         while (len > 0) {
119                 unsigned long this_len;
120
121                 this_len = min_t(unsigned long, len, iov->iov_len);
122                 if (fault_in_pages_writeable(iov->iov_base, this_len))
123                         break;
124
125                 len -= this_len;
126                 iov++;
127         }
128
129         return len;
130 }
131
132 /*
133  * Pre-fault in the user memory, so we can use atomic copies.
134  */
135 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
136 {
137         while (!iov->iov_len)
138                 iov++;
139
140         while (len > 0) {
141                 unsigned long this_len;
142
143                 this_len = min_t(unsigned long, len, iov->iov_len);
144                 fault_in_pages_readable(iov->iov_base, this_len);
145                 len -= this_len;
146                 iov++;
147         }
148 }
149
150 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
151                                   struct pipe_buffer *buf)
152 {
153         struct page *page = buf->page;
154
155         /*
156          * If nobody else uses this page, and we don't already have a
157          * temporary page, let's keep track of it as a one-deep
158          * allocation cache. (Otherwise just release our reference to it)
159          */
160         if (page_count(page) == 1 && !pipe->tmp_page)
161                 pipe->tmp_page = page;
162         else
163                 page_cache_release(page);
164 }
165
166 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
167                            struct pipe_buffer *buf, int atomic)
168 {
169         if (atomic) {
170                 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
171                 return kmap_atomic(buf->page, KM_USER0);
172         }
173
174         return kmap(buf->page);
175 }
176
177 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
178                             struct pipe_buffer *buf, void *map_data)
179 {
180         if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
181                 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
182                 kunmap_atomic(map_data, KM_USER0);
183         } else
184                 kunmap(buf->page);
185 }
186
187 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
188                                struct pipe_buffer *buf)
189 {
190         struct page *page = buf->page;
191
192         if (page_count(page) == 1) {
193                 lock_page(page);
194                 return 0;
195         }
196
197         return 1;
198 }
199
200 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
201 {
202         page_cache_get(buf->page);
203 }
204
/*
 * Generic ->pin() implementation: nothing needs to be done for the
 * buffer, so it always reports success (0).
 */
int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf)
{
	/* Neither argument is examined. */
	return 0;
}
209
210 static struct pipe_buf_operations anon_pipe_buf_ops = {
211         .can_merge = 1,
212         .map = generic_pipe_buf_map,
213         .unmap = generic_pipe_buf_unmap,
214         .pin = generic_pipe_buf_pin,
215         .release = anon_pipe_buf_release,
216         .steal = anon_pipe_buf_steal,
217         .get = generic_pipe_buf_get,
218 };
219
220 static ssize_t
221 pipe_readv(struct file *filp, const struct iovec *_iov,
222            unsigned long nr_segs, loff_t *ppos)
223 {
224         struct inode *inode = filp->f_dentry->d_inode;
225         struct pipe_inode_info *pipe;
226         int do_wakeup;
227         ssize_t ret;
228         struct iovec *iov = (struct iovec *)_iov;
229         size_t total_len;
230
231         total_len = iov_length(iov, nr_segs);
232         /* Null read succeeds. */
233         if (unlikely(total_len == 0))
234                 return 0;
235
236         do_wakeup = 0;
237         ret = 0;
238         mutex_lock(&inode->i_mutex);
239         pipe = inode->i_pipe;
240         for (;;) {
241                 int bufs = pipe->nrbufs;
242                 if (bufs) {
243                         int curbuf = pipe->curbuf;
244                         struct pipe_buffer *buf = pipe->bufs + curbuf;
245                         struct pipe_buf_operations *ops = buf->ops;
246                         void *addr;
247                         size_t chars = buf->len;
248                         int error, atomic;
249
250                         if (chars > total_len)
251                                 chars = total_len;
252
253                         error = ops->pin(pipe, buf);
254                         if (error) {
255                                 if (!ret)
256                                         error = ret;
257                                 break;
258                         }
259
260                         atomic = !iov_fault_in_pages_write(iov, chars);
261 redo:
262                         addr = ops->map(pipe, buf, atomic);
263                         error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
264                         ops->unmap(pipe, buf, addr);
265                         if (unlikely(error)) {
266                                 /*
267                                  * Just retry with the slow path if we failed.
268                                  */
269                                 if (atomic) {
270                                         atomic = 0;
271                                         goto redo;
272                                 }
273                                 if (!ret)
274                                         ret = error;
275                                 break;
276                         }
277                         ret += chars;
278                         buf->offset += chars;
279                         buf->len -= chars;
280                         if (!buf->len) {
281                                 buf->ops = NULL;
282                                 ops->release(pipe, buf);
283                                 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
284                                 pipe->curbuf = curbuf;
285                                 pipe->nrbufs = --bufs;
286                                 do_wakeup = 1;
287                         }
288                         total_len -= chars;
289                         if (!total_len)
290                                 break;  /* common path: read succeeded */
291                 }
292                 if (bufs)       /* More to do? */
293                         continue;
294                 if (!pipe->writers)
295                         break;
296                 if (!pipe->waiting_writers) {
297                         /* syscall merging: Usually we must not sleep
298                          * if O_NONBLOCK is set, or if we got some data.
299                          * But if a writer sleeps in kernel space, then
300                          * we can wait for that data without violating POSIX.
301                          */
302                         if (ret)
303                                 break;
304                         if (filp->f_flags & O_NONBLOCK) {
305                                 ret = -EAGAIN;
306                                 break;
307                         }
308                 }
309                 if (signal_pending(current)) {
310                         if (!ret)
311                                 ret = -ERESTARTSYS;
312                         break;
313                 }
314                 if (do_wakeup) {
315                         wake_up_interruptible_sync(&pipe->wait);
316                         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
317                 }
318                 pipe_wait(pipe);
319         }
320         mutex_unlock(&inode->i_mutex);
321
322         /* Signal writers asynchronously that there is more room. */
323         if (do_wakeup) {
324                 wake_up_interruptible(&pipe->wait);
325                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
326         }
327         if (ret > 0)
328                 file_accessed(filp);
329         return ret;
330 }
331
332 static ssize_t
333 pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
334 {
335         struct iovec iov = { .iov_base = buf, .iov_len = count };
336
337         return pipe_readv(filp, &iov, 1, ppos);
338 }
339
340 static ssize_t
341 pipe_writev(struct file *filp, const struct iovec *_iov,
342             unsigned long nr_segs, loff_t *ppos)
343 {
344         struct inode *inode = filp->f_dentry->d_inode;
345         struct pipe_inode_info *pipe;
346         ssize_t ret;
347         int do_wakeup;
348         struct iovec *iov = (struct iovec *)_iov;
349         size_t total_len;
350         ssize_t chars;
351
352         total_len = iov_length(iov, nr_segs);
353         /* Null write succeeds. */
354         if (unlikely(total_len == 0))
355                 return 0;
356
357         do_wakeup = 0;
358         ret = 0;
359         mutex_lock(&inode->i_mutex);
360         pipe = inode->i_pipe;
361
362         if (!pipe->readers) {
363                 send_sig(SIGPIPE, current, 0);
364                 ret = -EPIPE;
365                 goto out;
366         }
367
368         /* We try to merge small writes */
369         chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
370         if (pipe->nrbufs && chars != 0) {
371                 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
372                                                         (PIPE_BUFFERS-1);
373                 struct pipe_buffer *buf = pipe->bufs + lastbuf;
374                 struct pipe_buf_operations *ops = buf->ops;
375                 int offset = buf->offset + buf->len;
376
377                 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
378                         int error, atomic = 1;
379                         void *addr;
380
381                         error = ops->pin(pipe, buf);
382                         if (error)
383                                 goto out;
384
385                         iov_fault_in_pages_read(iov, chars);
386 redo1:
387                         addr = ops->map(pipe, buf, atomic);
388                         error = pipe_iov_copy_from_user(offset + addr, iov,
389                                                         chars, atomic);
390                         ops->unmap(pipe, buf, addr);
391                         ret = error;
392                         do_wakeup = 1;
393                         if (error) {
394                                 if (atomic) {
395                                         atomic = 0;
396                                         goto redo1;
397                                 }
398                                 goto out;
399                         }
400                         buf->len += chars;
401                         total_len -= chars;
402                         ret = chars;
403                         if (!total_len)
404                                 goto out;
405                 }
406         }
407
408         for (;;) {
409                 int bufs;
410
411                 if (!pipe->readers) {
412                         send_sig(SIGPIPE, current, 0);
413                         if (!ret)
414                                 ret = -EPIPE;
415                         break;
416                 }
417                 bufs = pipe->nrbufs;
418                 if (bufs < PIPE_BUFFERS) {
419                         int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
420                         struct pipe_buffer *buf = pipe->bufs + newbuf;
421                         struct page *page = pipe->tmp_page;
422                         char *src;
423                         int error, atomic = 1;
424
425                         if (!page) {
426                                 page = alloc_page(GFP_HIGHUSER);
427                                 if (unlikely(!page)) {
428                                         ret = ret ? : -ENOMEM;
429                                         break;
430                                 }
431                                 pipe->tmp_page = page;
432                         }
433                         /* Always wake up, even if the copy fails. Otherwise
434                          * we lock up (O_NONBLOCK-)readers that sleep due to
435                          * syscall merging.
436                          * FIXME! Is this really true?
437                          */
438                         do_wakeup = 1;
439                         chars = PAGE_SIZE;
440                         if (chars > total_len)
441                                 chars = total_len;
442
443                         iov_fault_in_pages_read(iov, chars);
444 redo2:
445                         if (atomic)
446                                 src = kmap_atomic(page, KM_USER0);
447                         else
448                                 src = kmap(page);
449
450                         error = pipe_iov_copy_from_user(src, iov, chars,
451                                                         atomic);
452                         if (atomic)
453                                 kunmap_atomic(src, KM_USER0);
454                         else
455                                 kunmap(page);
456
457                         if (unlikely(error)) {
458                                 if (atomic) {
459                                         atomic = 0;
460                                         goto redo2;
461                                 }
462                                 if (!ret)
463                                         ret = error;
464                                 break;
465                         }
466                         ret += chars;
467
468                         /* Insert it into the buffer array */
469                         buf->page = page;
470                         buf->ops = &anon_pipe_buf_ops;
471                         buf->offset = 0;
472                         buf->len = chars;
473                         pipe->nrbufs = ++bufs;
474                         pipe->tmp_page = NULL;
475
476                         total_len -= chars;
477                         if (!total_len)
478                                 break;
479                 }
480                 if (bufs < PIPE_BUFFERS)
481                         continue;
482                 if (filp->f_flags & O_NONBLOCK) {
483                         if (!ret)
484                                 ret = -EAGAIN;
485                         break;
486                 }
487                 if (signal_pending(current)) {
488                         if (!ret)
489                                 ret = -ERESTARTSYS;
490                         break;
491                 }
492                 if (do_wakeup) {
493                         wake_up_interruptible_sync(&pipe->wait);
494                         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
495                         do_wakeup = 0;
496                 }
497                 pipe->waiting_writers++;
498                 pipe_wait(pipe);
499                 pipe->waiting_writers--;
500         }
501 out:
502         mutex_unlock(&inode->i_mutex);
503         if (do_wakeup) {
504                 wake_up_interruptible(&pipe->wait);
505                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
506         }
507         if (ret > 0)
508                 file_update_time(filp);
509         return ret;
510 }
511
512 static ssize_t
513 pipe_write(struct file *filp, const char __user *buf,
514            size_t count, loff_t *ppos)
515 {
516         struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
517
518         return pipe_writev(filp, &iov, 1, ppos);
519 }
520
521 static ssize_t
522 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
523 {
524         return -EBADF;
525 }
526
527 static ssize_t
528 bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
529            loff_t *ppos)
530 {
531         return -EBADF;
532 }
533
534 static int
535 pipe_ioctl(struct inode *pino, struct file *filp,
536            unsigned int cmd, unsigned long arg)
537 {
538         struct inode *inode = filp->f_dentry->d_inode;
539         struct pipe_inode_info *pipe;
540         int count, buf, nrbufs;
541
542         switch (cmd) {
543                 case FIONREAD:
544                         mutex_lock(&inode->i_mutex);
545                         pipe = inode->i_pipe;
546                         count = 0;
547                         buf = pipe->curbuf;
548                         nrbufs = pipe->nrbufs;
549                         while (--nrbufs >= 0) {
550                                 count += pipe->bufs[buf].len;
551                                 buf = (buf+1) & (PIPE_BUFFERS-1);
552                         }
553                         mutex_unlock(&inode->i_mutex);
554
555                         return put_user(count, (int __user *)arg);
556                 default:
557                         return -EINVAL;
558         }
559 }
560
561 /* No kernel lock held - fine */
562 static unsigned int
563 pipe_poll(struct file *filp, poll_table *wait)
564 {
565         unsigned int mask;
566         struct inode *inode = filp->f_dentry->d_inode;
567         struct pipe_inode_info *pipe = inode->i_pipe;
568         int nrbufs;
569
570         poll_wait(filp, &pipe->wait, wait);
571
572         /* Reading only -- no need for acquiring the semaphore.  */
573         nrbufs = pipe->nrbufs;
574         mask = 0;
575         if (filp->f_mode & FMODE_READ) {
576                 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
577                 if (!pipe->writers && filp->f_version != pipe->w_counter)
578                         mask |= POLLHUP;
579         }
580
581         if (filp->f_mode & FMODE_WRITE) {
582                 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
583                 /*
584                  * Most Unices do not set POLLERR for FIFOs but on Linux they
585                  * behave exactly like pipes for poll().
586                  */
587                 if (!pipe->readers)
588                         mask |= POLLERR;
589         }
590
591         return mask;
592 }
593
594 static int
595 pipe_release(struct inode *inode, int decr, int decw)
596 {
597         struct pipe_inode_info *pipe;
598
599         mutex_lock(&inode->i_mutex);
600         pipe = inode->i_pipe;
601         pipe->readers -= decr;
602         pipe->writers -= decw;
603
604         if (!pipe->readers && !pipe->writers) {
605                 free_pipe_info(inode);
606         } else {
607                 wake_up_interruptible(&pipe->wait);
608                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
609                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
610         }
611         mutex_unlock(&inode->i_mutex);
612
613         return 0;
614 }
615
616 static int
617 pipe_read_fasync(int fd, struct file *filp, int on)
618 {
619         struct inode *inode = filp->f_dentry->d_inode;
620         int retval;
621
622         mutex_lock(&inode->i_mutex);
623         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
624         mutex_unlock(&inode->i_mutex);
625
626         if (retval < 0)
627                 return retval;
628
629         return 0;
630 }
631
632
633 static int
634 pipe_write_fasync(int fd, struct file *filp, int on)
635 {
636         struct inode *inode = filp->f_dentry->d_inode;
637         int retval;
638
639         mutex_lock(&inode->i_mutex);
640         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
641         mutex_unlock(&inode->i_mutex);
642
643         if (retval < 0)
644                 return retval;
645
646         return 0;
647 }
648
649
650 static int
651 pipe_rdwr_fasync(int fd, struct file *filp, int on)
652 {
653         struct inode *inode = filp->f_dentry->d_inode;
654         struct pipe_inode_info *pipe = inode->i_pipe;
655         int retval;
656
657         mutex_lock(&inode->i_mutex);
658
659         retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
660
661         if (retval >= 0)
662                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
663
664         mutex_unlock(&inode->i_mutex);
665
666         if (retval < 0)
667                 return retval;
668
669         return 0;
670 }
671
672
/*
 * release() for read ends: remove the file from the reader fasync list,
 * then drop one reader from the pipe.
 */
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	int rc;

	pipe_read_fasync(-1, filp, 0);
	rc = pipe_release(inode, 1, 0);

	return rc;
}
679
/*
 * release() for write ends: remove the file from the writer fasync list,
 * then drop one writer from the pipe.
 */
static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	int rc;

	pipe_write_fasync(-1, filp, 0);
	rc = pipe_release(inode, 0, 1);

	return rc;
}
686
687 static int
688 pipe_rdwr_release(struct inode *inode, struct file *filp)
689 {
690         int decr, decw;
691
692         pipe_rdwr_fasync(-1, filp, 0);
693         decr = (filp->f_mode & FMODE_READ) != 0;
694         decw = (filp->f_mode & FMODE_WRITE) != 0;
695         return pipe_release(inode, decr, decw);
696 }
697
698 static int
699 pipe_read_open(struct inode *inode, struct file *filp)
700 {
701         /* We could have perhaps used atomic_t, but this and friends
702            below are the only places.  So it doesn't seem worthwhile.  */
703         mutex_lock(&inode->i_mutex);
704         inode->i_pipe->readers++;
705         mutex_unlock(&inode->i_mutex);
706
707         return 0;
708 }
709
710 static int
711 pipe_write_open(struct inode *inode, struct file *filp)
712 {
713         mutex_lock(&inode->i_mutex);
714         inode->i_pipe->writers++;
715         mutex_unlock(&inode->i_mutex);
716
717         return 0;
718 }
719
720 static int
721 pipe_rdwr_open(struct inode *inode, struct file *filp)
722 {
723         mutex_lock(&inode->i_mutex);
724         if (filp->f_mode & FMODE_READ)
725                 inode->i_pipe->readers++;
726         if (filp->f_mode & FMODE_WRITE)
727                 inode->i_pipe->writers++;
728         mutex_unlock(&inode->i_mutex);
729
730         return 0;
731 }
732
733 /*
734  * The file_operations structs are not static because they
735  * are also used in linux/fs/fifo.c to do operations on FIFOs.
736  */
737 const struct file_operations read_fifo_fops = {
738         .llseek         = no_llseek,
739         .read           = pipe_read,
740         .readv          = pipe_readv,
741         .write          = bad_pipe_w,
742         .poll           = pipe_poll,
743         .ioctl          = pipe_ioctl,
744         .open           = pipe_read_open,
745         .release        = pipe_read_release,
746         .fasync         = pipe_read_fasync,
747 };
748
749 const struct file_operations write_fifo_fops = {
750         .llseek         = no_llseek,
751         .read           = bad_pipe_r,
752         .write          = pipe_write,
753         .writev         = pipe_writev,
754         .poll           = pipe_poll,
755         .ioctl          = pipe_ioctl,
756         .open           = pipe_write_open,
757         .release        = pipe_write_release,
758         .fasync         = pipe_write_fasync,
759 };
760
761 const struct file_operations rdwr_fifo_fops = {
762         .llseek         = no_llseek,
763         .read           = pipe_read,
764         .readv          = pipe_readv,
765         .write          = pipe_write,
766         .writev         = pipe_writev,
767         .poll           = pipe_poll,
768         .ioctl          = pipe_ioctl,
769         .open           = pipe_rdwr_open,
770         .release        = pipe_rdwr_release,
771         .fasync         = pipe_rdwr_fasync,
772 };
773
774 static struct file_operations read_pipe_fops = {
775         .llseek         = no_llseek,
776         .read           = pipe_read,
777         .readv          = pipe_readv,
778         .write          = bad_pipe_w,
779         .poll           = pipe_poll,
780         .ioctl          = pipe_ioctl,
781         .open           = pipe_read_open,
782         .release        = pipe_read_release,
783         .fasync         = pipe_read_fasync,
784 };
785
786 static struct file_operations write_pipe_fops = {
787         .llseek         = no_llseek,
788         .read           = bad_pipe_r,
789         .write          = pipe_write,
790         .writev         = pipe_writev,
791         .poll           = pipe_poll,
792         .ioctl          = pipe_ioctl,
793         .open           = pipe_write_open,
794         .release        = pipe_write_release,
795         .fasync         = pipe_write_fasync,
796 };
797
798 static struct file_operations rdwr_pipe_fops = {
799         .llseek         = no_llseek,
800         .read           = pipe_read,
801         .readv          = pipe_readv,
802         .write          = pipe_write,
803         .writev         = pipe_writev,
804         .poll           = pipe_poll,
805         .ioctl          = pipe_ioctl,
806         .open           = pipe_rdwr_open,
807         .release        = pipe_rdwr_release,
808         .fasync         = pipe_rdwr_fasync,
809 };
810
811 struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
812 {
813         struct pipe_inode_info *pipe;
814
815         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
816         if (pipe) {
817                 init_waitqueue_head(&pipe->wait);
818                 pipe->r_counter = pipe->w_counter = 1;
819                 pipe->inode = inode;
820         }
821
822         return pipe;
823 }
824
825 void __free_pipe_info(struct pipe_inode_info *pipe)
826 {
827         int i;
828
829         for (i = 0; i < PIPE_BUFFERS; i++) {
830                 struct pipe_buffer *buf = pipe->bufs + i;
831                 if (buf->ops)
832                         buf->ops->release(pipe, buf);
833         }
834         if (pipe->tmp_page)
835                 __free_page(pipe->tmp_page);
836         kfree(pipe);
837 }
838
839 void free_pipe_info(struct inode *inode)
840 {
841         __free_pipe_info(inode->i_pipe);
842         inode->i_pipe = NULL;
843 }
844
845 static struct vfsmount *pipe_mnt __read_mostly;
846 static int pipefs_delete_dentry(struct dentry *dentry)
847 {
848         return 1;
849 }
850
851 static struct dentry_operations pipefs_dentry_operations = {
852         .d_delete       = pipefs_delete_dentry,
853 };
854
855 static struct inode * get_pipe_inode(void)
856 {
857         struct inode *inode = new_inode(pipe_mnt->mnt_sb);
858         struct pipe_inode_info *pipe;
859
860         if (!inode)
861                 goto fail_inode;
862
863         pipe = alloc_pipe_info(inode);
864         if (!pipe)
865                 goto fail_iput;
866         inode->i_pipe = pipe;
867
868         pipe->readers = pipe->writers = 1;
869         inode->i_fop = &rdwr_pipe_fops;
870
871         /*
872          * Mark the inode dirty from the very beginning,
873          * that way it will never be moved to the dirty
874          * list because "mark_inode_dirty()" will think
875          * that it already _is_ on the dirty list.
876          */
877         inode->i_state = I_DIRTY;
878         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
879         inode->i_uid = current->fsuid;
880         inode->i_gid = current->fsgid;
881         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
882         inode->i_blksize = PAGE_SIZE;
883
884         return inode;
885
886 fail_iput:
887         iput(inode);
888
889 fail_inode:
890         return NULL;
891 }
892
893 int do_pipe(int *fd)
894 {
895         struct qstr this;
896         char name[32];
897         struct dentry *dentry;
898         struct inode * inode;
899         struct file *f1, *f2;
900         int error;
901         int i, j;
902
903         error = -ENFILE;
904         f1 = get_empty_filp();
905         if (!f1)
906                 goto no_files;
907
908         f2 = get_empty_filp();
909         if (!f2)
910                 goto close_f1;
911
912         inode = get_pipe_inode();
913         if (!inode)
914                 goto close_f12;
915
916         error = get_unused_fd();
917         if (error < 0)
918                 goto close_f12_inode;
919         i = error;
920
921         error = get_unused_fd();
922         if (error < 0)
923                 goto close_f12_inode_i;
924         j = error;
925
926         error = -ENOMEM;
927         sprintf(name, "[%lu]", inode->i_ino);
928         this.name = name;
929         this.len = strlen(name);
930         this.hash = inode->i_ino; /* will go */
931         dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
932         if (!dentry)
933                 goto close_f12_inode_i_j;
934
935         dentry->d_op = &pipefs_dentry_operations;
936         d_add(dentry, inode);
937         f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt));
938         f1->f_dentry = f2->f_dentry = dget(dentry);
939         f1->f_mapping = f2->f_mapping = inode->i_mapping;
940
941         /* read file */
942         f1->f_pos = f2->f_pos = 0;
943         f1->f_flags = O_RDONLY;
944         f1->f_op = &read_pipe_fops;
945         f1->f_mode = FMODE_READ;
946         f1->f_version = 0;
947
948         /* write file */
949         f2->f_flags = O_WRONLY;
950         f2->f_op = &write_pipe_fops;
951         f2->f_mode = FMODE_WRITE;
952         f2->f_version = 0;
953
954         fd_install(i, f1);
955         fd_install(j, f2);
956         fd[0] = i;
957         fd[1] = j;
958
959         return 0;
960
961 close_f12_inode_i_j:
962         put_unused_fd(j);
963 close_f12_inode_i:
964         put_unused_fd(i);
965 close_f12_inode:
966         free_pipe_info(inode);
967         iput(inode);
968 close_f12:
969         put_filp(f2);
970 close_f1:
971         put_filp(f1);
972 no_files:
973         return error;   
974 }
975
976 /*
977  * pipefs should _never_ be mounted by userland - too much of security hassle,
978  * no real gain from having the whole whorehouse mounted. So we don't need
979  * any operations on the root directory. However, we need a non-trivial
980  * d_name - pipe: will go nicely and kill the special-casing in procfs.
981  */
982
983 static struct super_block *
984 pipefs_get_sb(struct file_system_type *fs_type, int flags,
985               const char *dev_name, void *data)
986 {
987         return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
988 }
989
990 static struct file_system_type pipe_fs_type = {
991         .name           = "pipefs",
992         .get_sb         = pipefs_get_sb,
993         .kill_sb        = kill_anon_super,
994 };
995
996 static int __init init_pipe_fs(void)
997 {
998         int err = register_filesystem(&pipe_fs_type);
999
1000         if (!err) {
1001                 pipe_mnt = kern_mount(&pipe_fs_type);
1002                 if (IS_ERR(pipe_mnt)) {
1003                         err = PTR_ERR(pipe_mnt);
1004                         unregister_filesystem(&pipe_fs_type);
1005                 }
1006         }
1007         return err;
1008 }
1009
1010 static void __exit exit_pipe_fs(void)
1011 {
1012         unregister_filesystem(&pipe_fs_type);
1013         mntput(pipe_mnt);
1014 }
1015
1016 fs_initcall(init_pipe_fs);
1017 module_exit(exit_pipe_fs);