Btrfs: O_DIRECT writes via buffered writes + invalidate
Chris Mason [Fri, 3 Oct 2008 16:30:02 +0000 (12:30 -0400)]
This reworks the btrfs O_DIRECT write code a bit.  It had always fallen
back to buffered IO and done an invalidate, but needed to be updated
for the data=ordered code.  The invalidate wasn't actually removing pages
because they were still inside an ordered extent.

This also combines the O_DIRECT/O_SYNC paths where possible, and kicks
off IO in the main btrfs_file_write loop to keep the pipe down to the
disk full as we process long writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

fs/btrfs/file.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h

index 3088a11..a03d1bb 100644 (file)
@@ -905,6 +905,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
        struct page *pinned[2];
        unsigned long first_index;
        unsigned long last_index;
+       int will_write;
+
+       will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+                     (file->f_flags & O_DIRECT));
 
        nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
                     PAGE_CACHE_SIZE / (sizeof(struct page *)));
@@ -1001,15 +1005,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (ret)
                        goto out;
 
+               if (will_write) {
+                       btrfs_fdatawrite_range(inode->i_mapping, pos,
+                                              pos + write_bytes - 1,
+                                              WB_SYNC_NONE);
+               } else {
+                       balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                          num_pages);
+                       if (num_pages <
+                           (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                               btrfs_btree_balance_dirty(root, 1);
+                       btrfs_throttle(root);
+               }
+
                buf += write_bytes;
                count -= write_bytes;
                pos += write_bytes;
                num_written += write_bytes;
 
-               balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
-               if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-                       btrfs_btree_balance_dirty(root, 1);
-               btrfs_throttle(root);
                cond_resched();
        }
 out:
@@ -1023,36 +1036,29 @@ out_nolock:
                page_cache_release(pinned[1]);
        *ppos = pos;
 
-       if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+       if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;
 
-               err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
-                                            start_pos + num_written -1,
-                                            WB_SYNC_NONE);
-               if (err < 0)
-                       num_written = err;
-
-               err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
-                                start_pos, start_pos + num_written - 1);
-               if (err < 0)
+               err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+               if (err)
                        num_written = err;
 
-               trans = btrfs_start_transaction(root, 1);
-               ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-               if (ret == 0) {
-                       btrfs_sync_log(trans, root);
-                       btrfs_end_transaction(trans, root);
-               } else {
-                       btrfs_commit_transaction(trans, root);
+               if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+                       trans = btrfs_start_transaction(root, 1);
+                       ret = btrfs_log_dentry_safe(trans, root,
+                                                   file->f_dentry);
+                       if (ret == 0) {
+                               btrfs_sync_log(trans, root);
+                               btrfs_end_transaction(trans, root);
+                       } else {
+                               btrfs_commit_transaction(trans, root);
+                       }
+               }
+               if (file->f_flags & O_DIRECT) {
+                       invalidate_mapping_pages(inode->i_mapping,
+                             start_pos >> PAGE_CACHE_SHIFT,
+                            (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
                }
-       } else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-               do_sync_mapping_range(inode->i_mapping, start_pos,
-                                     start_pos + num_written - 1,
-                                     SYNC_FILE_RANGE_WRITE |
-                                     SYNC_FILE_RANGE_WAIT_AFTER);
-               invalidate_mapping_pages(inode->i_mapping,
-                     start_pos >> PAGE_CACHE_SHIFT,
-                    (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
        }
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
index dcc1730..2eb6cab 100644 (file)
@@ -397,7 +397,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
        u64 end;
        u64 orig_end;
@@ -451,6 +451,7 @@ again:
                       (unsigned long long)orig_end);
                goto again;
        }
+       return 0;
 }
 
 /*
index fd45519..f50f887 100644 (file)
@@ -135,7 +135,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
                                                         u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
                                struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,