mm: trim more holes
diff --git a/mm/filemap.c b/mm/filemap.c
index 0876cc5..73b98c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
 {
        struct page *page;
 
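
The unsigned long -> pgoff_t switches in this and the following hunks are
type documentation rather than behaviour changes: pgoff_t is the kernel's
name for a pagecache index, and is plainly unsigned long underneath
(include/linux/types.h defines it, roughly):

        /* The type of an index into the pagecache. */
        #ifndef pgoff_t
        #define pgoff_t unsigned long
        #endif

A byte position converts to a pagecache index by shifting, as the code
does throughout: index = pos >> PAGE_CACHE_SHIFT.
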
@@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page);
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
 struct page *find_lock_page(struct address_space *mapping,
-                               unsigned long offset)
+                               pgoff_t offset)
 {
        struct page *page;
 
-       read_lock_irq(&mapping->tree_lock);
 repeat:
+       read_lock_irq(&mapping->tree_lock);
        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page) {
                page_cache_get(page);
                if (TestSetPageLocked(page)) {
                        read_unlock_irq(&mapping->tree_lock);
                        __lock_page(page);
-                       read_lock_irq(&mapping->tree_lock);
 
                        /* Has the page been truncated while we slept? */
-                       if (unlikely(page->mapping != mapping ||
-                                    page->index != offset)) {
+                       if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
                                page_cache_release(page);
                                goto repeat;
                        }
+                       VM_BUG_ON(page->index != offset);
+                       goto out;
                }
        }
        read_unlock_irq(&mapping->tree_lock);
+out:
        return page;
 }
 EXPORT_SYMBOL(find_lock_page);
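
This rework embodies a lookup-lock-revalidate protocol: tree_lock must be
dropped before sleeping in __lock_page(), so the page can be truncated
while we sleep. Only page->mapping needs rechecking afterwards; truncation
removes a page while holding its page lock, so a page we now hold locked
that still points at the right mapping cannot have moved to another index,
which is why the old index test shrinks to a VM_BUG_ON. The new "goto out"
is needed because this branch already dropped tree_lock; falling through
would unlock it a second time. In outline (a sketch, NULL check omitted):

        repeat:
                /* speculative lookup, tree_lock held */
                page = radix_tree_lookup(&mapping->page_tree, offset);
                page_cache_get(page);           /* pin before unlocking tree */
                if (TestSetPageLocked(page)) {  /* someone else holds it */
                        read_unlock_irq(&mapping->tree_lock);
                        __lock_page(page);      /* may sleep */
                        if (page->mapping != mapping) {
                                /* truncated while we slept: unpin, retry */
                                unlock_page(page);
                                page_cache_release(page);
                                goto repeat;
                        }
                }
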
@@ -663,7 +664,7 @@ EXPORT_SYMBOL(find_lock_page);
  * memory exhaustion.
  */
 struct page *find_or_create_page(struct address_space *mapping,
-               unsigned long index, gfp_t gfp_mask)
+               pgoff_t index, gfp_t gfp_mask)
 {
        struct page *page, *cached_page = NULL;
        int err;
@@ -797,7 +798,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
  * and deadlock against the caller's locked page.
  */
 struct page *
-grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 {
        struct page *page = find_get_page(mapping, index);
 
@@ -859,47 +860,49 @@ static void shrink_readahead_size_eio(struct file *filp,
  * It may be NULL.
  */
 void do_generic_mapping_read(struct address_space *mapping,
-                            struct file_ra_state *_ra,
+                            struct file_ra_state *ra,
                             struct file *filp,
                             loff_t *ppos,
                             read_descriptor_t *desc,
                             read_actor_t actor)
 {
        struct inode *inode = mapping->host;
-       unsigned long index;
-       unsigned long offset;
-       unsigned long last_index;
-       unsigned long next_index;
-       unsigned long prev_index;
+       pgoff_t index;
+       pgoff_t last_index;
+       pgoff_t prev_index;
+       unsigned long offset;      /* offset into pagecache page */
        unsigned int prev_offset;
        struct page *cached_page;
        int error;
-       struct file_ra_state ra = *_ra;
 
        cached_page = NULL;
        index = *ppos >> PAGE_CACHE_SHIFT;
-       next_index = index;
-       prev_index = ra.prev_index;
-       prev_offset = ra.prev_offset;
+       prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
+       prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;
 
        for (;;) {
                struct page *page;
-               unsigned long end_index;
+               pgoff_t end_index;
                loff_t isize;
                unsigned long nr, ret;
 
                cond_resched();
-               if (index == next_index)
-                       next_index = page_cache_readahead(mapping, &ra, filp,
-                                       index, last_index - index);
-
 find_page:
                page = find_get_page(mapping, index);
-               if (unlikely(page == NULL)) {
-                       handle_ra_miss(mapping, &ra, index);
-                       goto no_cached_page;
+               if (!page) {
+                       page_cache_sync_readahead(mapping,
+                                       ra, filp,
+                                       index, last_index - index);
+                       page = find_get_page(mapping, index);
+                       if (unlikely(page == NULL))
+                               goto no_cached_page;
+               }
+               if (PageReadahead(page)) {
+                       page_cache_async_readahead(mapping,
+                                       ra, filp, page,
+                                       index, last_index - index);
                }
                if (!PageUptodate(page))
                        goto page_not_up_to_date;
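
This hunk converts do_generic_mapping_read() from the old
page_cache_readahead()/handle_ra_miss() window tracking to Fengguang Wu's
on-demand readahead, which has exactly two entry points: a synchronous
call on a cache miss, and an asynchronous call when a lookup returns a
page carrying the PG_readahead mark that the readahead code plants partway
into each window. The caller contract, reduced to essentials (a sketch;
req_size stands for the remaining request in pages):

        page = find_get_page(mapping, index);
        if (!page) {
                /* miss: read the current window synchronously, look again */
                page_cache_sync_readahead(mapping, ra, filp, index, req_size);
                page = find_get_page(mapping, index);
        }
        if (page && PageReadahead(page)) {
                /* marked page: pipeline the next window in the background */
                page_cache_async_readahead(mapping, ra, filp, page,
                                           index, req_size);
        }
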
@@ -961,7 +964,6 @@ page_ok:
                index += offset >> PAGE_CACHE_SHIFT;
                offset &= ~PAGE_CACHE_MASK;
                prev_offset = offset;
-               ra.prev_offset = offset;
 
                page_cache_release(page);
                if (ret == nr && desc->count)
@@ -1010,7 +1012,7 @@ readpage:
                                }
                                unlock_page(page);
                                error = -EIO;
-                               shrink_readahead_size_eio(filp, &ra);
+                               shrink_readahead_size_eio(filp, ra);
                                goto readpage_error;
                        }
                        unlock_page(page);
@@ -1050,9 +1052,11 @@ no_cached_page:
        }
 
 out:
-       *_ra = ra;
+       ra->prev_pos = prev_index;
+       ra->prev_pos <<= PAGE_CACHE_SHIFT;
+       ra->prev_pos |= prev_offset;
 
-       *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+       *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
        if (cached_page)
                page_cache_release(cached_page);
        if (filp)
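
The on-stack copy of the readahead state (struct file_ra_state ra = *_ra)
is gone; the function now updates the caller's state in place, and the old
prev_index/prev_offset pair is folded into a single loff_t prev_pos holding
the last read position in bytes. Packing and unpacking are plain shifts
and masks:

        /* on entry: unpack */
        prev_index  = ra->prev_pos >> PAGE_CACHE_SHIFT;
        prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE - 1);

        /* on exit: repack (equivalent to the three statements above) */
        ra->prev_pos = ((loff_t)prev_index << PAGE_CACHE_SHIFT) | prev_offset;
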
@@ -1212,29 +1216,9 @@ out:
 }
 EXPORT_SYMBOL(generic_file_aio_read);
 
-int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
-{
-       ssize_t written;
-       unsigned long count = desc->count;
-       struct file *file = desc->arg.data;
-
-       if (size > count)
-               size = count;
-
-       written = file->f_op->sendpage(file, page, offset,
-                                      size, &file->f_pos, size<count);
-       if (written < 0) {
-               desc->error = written;
-               written = 0;
-       }
-       desc->count = count - written;
-       desc->written += written;
-       return written;
-}
-
 static ssize_t
 do_readahead(struct address_space *mapping, struct file *filp,
-            unsigned long index, unsigned long nr)
+            pgoff_t index, unsigned long nr)
 {
        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
                return -EINVAL;
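
file_send_actor() can simply be deleted because nothing calls it any more:
by this point sendfile(2) is implemented on top of splice rather than on
do_generic_mapping_read() with a sendpage actor. Roughly, and assuming the
fs/splice.c naming of this era, the routing is now:

        /*
         * sys_sendfile() -> do_sendfile()
         *   -> do_splice_direct()
         *     -> splice_direct_to_actor()
         *       -> ->splice_read() on the source,
         *          ->splice_write()/->sendpage() on the sink
         */
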
@@ -1254,8 +1238,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
        if (file) {
                if (file->f_mode & FMODE_READ) {
                        struct address_space *mapping = file->f_mapping;
-                       unsigned long start = offset >> PAGE_CACHE_SHIFT;
-                       unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+                       pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+                       pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
                        unsigned long len = end - start + 1;
                        ret = do_readahead(mapping, file, start, len);
                }
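
The start/end page computation is inclusive, hence len = end - start + 1.
A worked example with 4 KiB pages (PAGE_CACHE_SHIFT == 12):

        /* readahead(fd, 5000, 3000) covers bytes 5000..7999:
         *   start = 5000 >> 12              = 1
         *   end   = (5000 + 3000 - 1) >> 12 = 7999 >> 12 = 1
         *   len   = 1 - 1 + 1               = 1 page (bytes 4096..8191)
         */
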
@@ -1265,7 +1249,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
 }
 
 #ifdef CONFIG_MMU
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
 /**
  * page_cache_read - adds requested page to the page cache if not already there
  * @file:      file to read
@@ -1274,7 +1257,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int fastcall page_cache_read(struct file * file, unsigned long offset)
+static int fastcall page_cache_read(struct file * file, pgoff_t offset)
 {
        struct address_space *mapping = file->f_mapping;
        struct page *page; 
@@ -1322,9 +1305,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct page *page;
        unsigned long size;
        int did_readaround = 0;
-       int ret;
-
-       ret = VM_FAULT_MINOR;
+       int ret = 0;
 
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (vmf->pgoff >= size)
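
"int ret = 0" replaces VM_FAULT_MINOR because the concurrent fault rework
turned ->fault() return values into a bitmask in which a minor fault is
simply 0. As I read the new scheme, the values relevant here are:

        /*
         * 0                success, minor fault (old VM_FAULT_MINOR)
         * VM_FAULT_MAJOR   had to wait for I/O (accounted as a major fault)
         * VM_FAULT_LOCKED  vmf->page is returned locked
         *                  (the interim name FAULT_RET_LOCKED, below)
         * VM_FAULT_SIGBUS, VM_FAULT_OOM   hard failures
         */
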
@@ -1335,33 +1316,37 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                goto no_cached_page;
 
        /*
-        * The readahead code wants to be told about each and every page
-        * so it can build and shrink its windows appropriately
-        *
-        * For sequential accesses, we use the generic readahead logic.
-        */
-       if (VM_SequentialReadHint(vma))
-               page_cache_readahead(mapping, ra, file, vmf->pgoff, 1);
-
-       /*
         * Do we have something in the page cache already?
         */
 retry_find:
        page = find_lock_page(mapping, vmf->pgoff);
+       /*
+        * For sequential accesses, we use the generic readahead logic.
+        */
+       if (VM_SequentialReadHint(vma)) {
+               if (!page) {
+                       page_cache_sync_readahead(mapping, ra, file,
+                                                          vmf->pgoff, 1);
+                       page = find_lock_page(mapping, vmf->pgoff);
+                       if (!page)
+                               goto no_cached_page;
+               }
+               if (PageReadahead(page)) {
+                       page_cache_async_readahead(mapping, ra, file, page,
+                                                          vmf->pgoff, 1);
+               }
+       }
+
        if (!page) {
                unsigned long ra_pages;
 
-               if (VM_SequentialReadHint(vma)) {
-                       handle_ra_miss(mapping, ra, vmf->pgoff);
-                       goto no_cached_page;
-               }
                ra->mmap_miss++;
 
                /*
                 * Do we miss much more than hit in this file? If so,
                 * stop bothering with read-ahead. It will only hurt.
                 */
-               if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
+               if (ra->mmap_miss > MMAP_LOTSAMISS)
                        goto no_cached_page;
 
                /*
@@ -1387,7 +1372,7 @@ retry_find:
        }
 
        if (!did_readaround)
-               ra->mmap_hit++;
+               ra->mmap_miss--;
 
        /*
         * We have a locked page in the page cache, now we need to check
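
The separate mmap_hit counter is gone: mmap_miss now does double duty,
incremented on every miss (earlier hunk) and decremented again on every
readaround hit, so it tracks recent misses net of hits and is compared
against the plain constant MMAP_LOTSAMISS (100 in this file) rather than
against mmap_hit + MMAP_LOTSAMISS. Note too that the sequential-hint path
in the previous hunk now reuses the same sync/async readahead pair as the
read path. The miss heuristic, in sketch form:

        if (!page)
                ra->mmap_miss++;                /* miss */
        else if (!did_readaround)
                ra->mmap_miss--;                /* hit claws one back */
        if (ra->mmap_miss > MMAP_LOTSAMISS)     /* mostly missing lately? */
                goto no_cached_page;            /* then skip readaround */
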
@@ -1400,6 +1385,7 @@ retry_find:
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(vmf->pgoff >= size)) {
                unlock_page(page);
+               page_cache_release(page);
                goto outside_data_content;
        }
 
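
The added page_cache_release() fixes a reference leak: find_lock_page()
returns the page locked *and* with an elevated refcount, so any bail-out
path must drop both, not just the lock. As a rule of thumb (illustrative
pairing only):

        page = find_lock_page(mapping, pgoff); /* takes PG_locked + a ref */
        ...
        unlock_page(page);                     /* drops the lock...       */
        page_cache_release(page);              /* ...and the reference    */
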
@@ -1407,8 +1393,9 @@ retry_find:
         * Found the page and have a reference on it.
         */
        mark_page_accessed(page);
+       ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
        vmf->page = page;
-       return ret | FAULT_RET_LOCKED;
+       return ret | VM_FAULT_LOCKED;
 
 outside_data_content:
        /*
@@ -1511,7 +1498,7 @@ EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_file_readonly_mmap);
 
 static struct page *__read_cache_page(struct address_space *mapping,
-                               unsigned long index,
+                               pgoff_t index,
                                int (*filler)(void *,struct page*),
                                void *data)
 {
@@ -1552,7 +1539,7 @@ repeat:
  * after submitting it to the filler.
  */
 struct page *read_cache_page_async(struct address_space *mapping,
-                               unsigned long index,
+                               pgoff_t index,
                                int (*filler)(void *,struct page*),
                                void *data)
 {
@@ -1600,7 +1587,7 @@ EXPORT_SYMBOL(read_cache_page_async);
  * If the page does not get brought uptodate, return -EIO.
  */
 struct page *read_cache_page(struct address_space *mapping,
-                               unsigned long index,
+                               pgoff_t index,
                                int (*filler)(void *,struct page*),
                                void *data)
 {
@@ -1851,16 +1838,15 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                size_t count, ssize_t written)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space * mapping = file->f_mapping;
+       struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        struct inode    *inode = mapping->host;
        long            status = 0;
        struct page     *page;
        struct page     *cached_page = NULL;
-       size_t          bytes;
        struct pagevec  lru_pvec;
        const struct iovec *cur_iov = iov; /* current iovec */
-       size_t          iov_base = 0;      /* offset in the current iovec */
+       size_t          iov_offset = 0;    /* offset in the current iovec */
        char __user     *buf;
 
        pagevec_init(&lru_pvec, 0);
@@ -1871,115 +1857,102 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        if (likely(nr_segs == 1))
                buf = iov->iov_base + written;
        else {
-               filemap_set_next_iovec(&cur_iov, &iov_base, written);
-               buf = cur_iov->iov_base + iov_base;
+               filemap_set_next_iovec(&cur_iov, &iov_offset, written);
+               buf = cur_iov->iov_base + iov_offset;
        }
 
        do {
-               unsigned long index;
-               unsigned long offset;
-               size_t copied;
+               pgoff_t index;          /* Pagecache index for current page */
+               unsigned long offset;   /* Offset into pagecache page */
+               unsigned long maxlen;   /* Bytes remaining in current iovec */
+               size_t bytes;           /* Bytes to write to page */
+               size_t copied;          /* Bytes copied from user */
 
-               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+               offset = (pos & (PAGE_CACHE_SIZE - 1));
                index = pos >> PAGE_CACHE_SHIFT;
                bytes = PAGE_CACHE_SIZE - offset;
+               if (bytes > count)
+                       bytes = count;
 
-               /* Limit the size of the copy to the caller's write size */
-               bytes = min(bytes, count);
+               maxlen = cur_iov->iov_len - iov_offset;
+               if (maxlen > bytes)
+                       maxlen = bytes;
 
-               /* We only need to worry about prefaulting when writes are from
-                * user-space.  NFSd uses vfs_writev with several non-aligned
-                * segments in the vector, and limiting to one segment a time is
-                * a noticeable performance for re-write
+#ifndef CONFIG_DEBUG_VM
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
                 */
-               if (!segment_eq(get_fs(), KERNEL_DS)) {
-                       /*
-                        * Limit the size of the copy to that of the current
-                        * segment, because fault_in_pages_readable() doesn't
-                        * know how to walk segments.
-                        */
-                       bytes = min(bytes, cur_iov->iov_len - iov_base);
+               fault_in_pages_readable(buf, maxlen);
+#endif
 
-                       /*
-                        * Bring in the user page that we will copy from
-                        * _first_.  Otherwise there's a nasty deadlock on
-                        * copying from the same page as we're writing to,
-                        * without it being marked up-to-date.
-                        */
-                       fault_in_pages_readable(buf, bytes);
-               }
                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                if (!page) {
                        status = -ENOMEM;
                        break;
                }
 
-               if (unlikely(bytes == 0)) {
-                       status = 0;
-                       copied = 0;
-                       goto zero_length_segment;
-               }
-
                status = a_ops->prepare_write(file, page, offset, offset+bytes);
-               if (unlikely(status)) {
-                       loff_t isize = i_size_read(inode);
+               if (unlikely(status))
+                       goto fs_write_aop_error;
 
-                       if (status != AOP_TRUNCATED_PAGE)
-                               unlock_page(page);
-                       page_cache_release(page);
-                       if (status == AOP_TRUNCATED_PAGE)
-                               continue;
-                       /*
-                        * prepare_write() may have instantiated a few blocks
-                        * outside i_size.  Trim these off again.
-                        */
-                       if (pos + bytes > isize)
-                               vmtruncate(inode, isize);
-                       break;
-               }
                if (likely(nr_segs == 1))
                        copied = filemap_copy_from_user(page, offset,
                                                        buf, bytes);
                else
                        copied = filemap_copy_from_user_iovec(page, offset,
-                                               cur_iov, iov_base, bytes);
+                                               cur_iov, iov_offset, bytes);
                flush_dcache_page(page);
                status = a_ops->commit_write(file, page, offset, offset+bytes);
-               if (status == AOP_TRUNCATED_PAGE) {
-                       page_cache_release(page);
-                       continue;
+               if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE))
+                       goto fs_write_aop_error;
+               if (unlikely(copied != bytes)) {
+                       status = -EFAULT;
+                       goto fs_write_aop_error;
                }
-zero_length_segment:
-               if (likely(copied >= 0)) {
-                       if (!status)
-                               status = copied;
-
-                       if (status >= 0) {
-                               written += status;
-                               count -= status;
-                               pos += status;
-                               buf += status;
-                               if (unlikely(nr_segs > 1)) {
-                                       filemap_set_next_iovec(&cur_iov,
-                                                       &iov_base, status);
-                                       if (count)
-                                               buf = cur_iov->iov_base +
-                                                       iov_base;
-                               } else {
-                                       iov_base += status;
-                               }
+               if (unlikely(status > 0)) /* filesystem did partial write */
+                       copied = status;
+
+               if (likely(copied > 0)) {
+                       written += copied;
+                       count -= copied;
+                       pos += copied;
+                       buf += copied;
+                       if (unlikely(nr_segs > 1)) {
+                               filemap_set_next_iovec(&cur_iov,
+                                               &iov_offset, copied);
+                               if (count)
+                                       buf = cur_iov->iov_base + iov_offset;
+                       } else {
+                               iov_offset += copied;
                        }
                }
-               if (unlikely(copied != bytes))
-                       if (status >= 0)
-                               status = -EFAULT;
                unlock_page(page);
                mark_page_accessed(page);
                page_cache_release(page);
-               if (status < 0)
-                       break;
                balance_dirty_pages_ratelimited(mapping);
                cond_resched();
+               continue;
+
+fs_write_aop_error:
+               if (status != AOP_TRUNCATED_PAGE)
+                       unlock_page(page);
+               page_cache_release(page);
+
+               /*
+                * prepare_write() may have instantiated a few blocks
+                * outside i_size.  Trim these off again. Don't need
+                * i_size_read because we hold i_mutex.
+                */
+               if (pos + bytes > inode->i_size)
+                       vmtruncate(inode, inode->i_size);
+               if (status == AOP_TRUNCATED_PAGE)
+                       continue;
+               else
+                       break;
+
        } while (count);
        *ppos = pos;
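
The rewritten write loop funnels every failure through the single
fs_write_aop_error block, which restores the two invariants the old code
re-established piecemeal: the page is unlocked (unless the aop returned
AOP_TRUNCATED_PAGE, in which case it already dropped the lock) and
released, and blocks that prepare_write() may have instantiated beyond
i_size are trimmed off with vmtruncate(). The #ifndef CONFIG_DEBUG_VM
around the prefault apparently exists so that DEBUG_VM builds regularly
exercise the short-copy fallback. The body of the loop, in outline (not
compilable):

        /*
         * 1. fault in the source user page (deadlock avoidance);
         * 2. __grab_cache_page()  ->  a locked pagecache page;
         * 3. ->prepare_write();   on error: goto fs_write_aop_error;
         * 4. copy from user (may be short), flush_dcache_page();
         * 5. ->commit_write();    on error, AOP_TRUNCATED_PAGE or a short
         *                         copy: goto fs_write_aop_error;
         * 6. advance buf/pos/count by `copied`, unlock + release the page,
         *    balance_dirty_pages_ratelimited(), continue;
         *
         * fs_write_aop_error: unlock (unless AOP_TRUNCATED_PAGE already
         * did) and release the page, vmtruncate() past-EOF blocks, then
         * continue (AOP_TRUNCATED_PAGE) or break (real error).
         */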