[PATCH] direct-io: sync and invalidate file region when falling back to buffered...
Jeff Moyer [Fri, 20 Oct 2006 06:28:13 +0000 (23:28 -0700)]
When direct-io falls back to buffered write, it will just leave the dirty data
floating about in pagecache, pending regular writeback.

But normal direct-io semantics are that IO is synchronous, and that it leaves
no pagecache behind.

So change the fallback-to-buffered-write code to sync the file region and to
then strip away the pagecache, just as a regular direct-io write would do.

Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

mm/filemap.c

index 3464b68..57faa8d 100644 (file)
@@ -2222,7 +2222,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t *ppos)
 {
        struct file *file = iocb->ki_filp;
-       const struct address_space * mapping = file->f_mapping;
+       struct address_space * mapping = file->f_mapping;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
        struct inode    *inode = mapping->host;
@@ -2275,8 +2275,11 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
-               written = generic_file_direct_write(iocb, iov,
-                               &nr_segs, pos, ppos, count, ocount);
+               loff_t endbyte;
+               ssize_t written_buffered;
+
+               written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+                                                       ppos, count, ocount);
                if (written < 0 || written == count)
                        goto out;
                /*
@@ -2285,10 +2288,46 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                 */
                pos += written;
                count -= written;
-       }
+               written_buffered = generic_file_buffered_write(iocb, iov,
+                                               nr_segs, pos, ppos, count,
+                                               written);
+               /*
+                * If generic_file_buffered_write() retuned a synchronous error
+                * then we want to return the number of bytes which were
+                * direct-written, or the error code if that was zero.  Note
+                * that this differs from normal direct-io semantics, which
+                * will return -EFOO even if some bytes were written.
+                */
+               if (written_buffered < 0) {
+                       err = written_buffered;
+                       goto out;
+               }
 
-       written = generic_file_buffered_write(iocb, iov, nr_segs,
-                       pos, ppos, count, written);
+               /*
+                * We need to ensure that the page cache pages are written to
+                * disk and invalidated to preserve the expected O_DIRECT
+                * semantics.
+                */
+               endbyte = pos + written_buffered - written - 1;
+               err = do_sync_file_range(file, pos, endbyte,
+                                        SYNC_FILE_RANGE_WAIT_BEFORE|
+                                        SYNC_FILE_RANGE_WRITE|
+                                        SYNC_FILE_RANGE_WAIT_AFTER);
+               if (err == 0) {
+                       written = written_buffered;
+                       invalidate_mapping_pages(mapping,
+                                                pos >> PAGE_CACHE_SHIFT,
+                                                endbyte >> PAGE_CACHE_SHIFT);
+               } else {
+                       /*
+                        * We don't know how much we wrote, so just return
+                        * the number of bytes which were direct-written
+                        */
+               }
+       } else {
+               written = generic_file_buffered_write(iocb, iov, nr_segs,
+                               pos, ppos, count, written);
+       }
 out:
        current->backing_dev_info = NULL;
        return written ? written : err;