ceph: fix short sync reads from the OSD
Sage Weil [Wed, 1 Jun 2011 23:08:44 +0000 (16:08 -0700)]
If we get a short read from the OSD because the object is small, we need to
zero the remainder of the buffer.  For O_DIRECT reads, the attempted range
is not trimmed to i_size by the VFS, so we were actually looping
indefinitely.

Fix by trimming by i_size, and the unconditionally zeroing the trailing
range.

Reported-by: Jeff Wu <cpwu@tnsoft.com.cn>
Signed-off-by: Sage Weil <sage@newdream.net>

fs/ceph/file.c

index 8c5ac4e..b654f40 100644 (file)
@@ -283,7 +283,7 @@ int ceph_release(struct inode *inode, struct file *file)
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                       int *checkeof, bool align_to_pages,
+                       int *checkeof, bool o_direct,
                        unsigned long buf_align)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -308,7 +308,7 @@ static int striped_read(struct inode *inode,
        io_align = off & ~PAGE_MASK;
 
 more:
-       if (align_to_pages)
+       if (o_direct)
                page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
        else
                page_align = pos & ~PAGE_MASK;
@@ -346,20 +346,22 @@ more:
        }
 
        if (was_short) {
-               /* was original extent fully inside i_size? */
-               if (pos + left <= inode->i_size) {
-                       dout("zero tail\n");
-                       ceph_zero_page_vector_range(page_off + read, len - read,
+               /* did we bounce off eof? */
+               if (pos + left > inode->i_size)
+                       *checkeof = 1;
+
+               /* zero trailing bytes (inside i_size) */
+               if (left > 0 && pos < inode->i_size) {
+                       if (pos + left > inode->i_size)
+                               left = inode->i_size - pos;
+
+                       dout("zero tail %d\n", left);
+                       ceph_zero_page_vector_range(page_off + read, left,
                                                    pages);
-                       read = len;
-                       goto out;
+                       read += left;
                }
-
-               /* check i_size */
-               *checkeof = 1;
        }
 
-out:
        if (ret >= 0)
                ret = read;
        dout("striped_read returns %d\n", ret);
@@ -659,7 +661,7 @@ out:
 
                /* hit EOF or hole? */
                if (statret == 0 && *ppos < inode->i_size) {
-                       dout("aio_read sync_read hit hole, reading more\n");
+                       dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
                        read += ret;
                        base += ret;
                        len -= ret;