mm: avoid livelock on !__GFP_FS allocations
[linux-2.6.git] / mm / filemap.c
index 99977f0..7771871 100644 (file)
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock              (vmtruncate)
+ *  ->i_mmap_mutex             (truncate_pagecache)
  *    ->private_lock           (__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock            (exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_lock            (truncate->unmap_mapping_range)
+ *    ->i_mmap_mutex           (truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_mutex
  *      ->page_table_lock or pte_lock  (various, mainly in memory.c)
  *        ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
  *
  *  ->i_mutex                  (generic_file_buffered_write)
  *    ->mmap_sem               (fault_in_pages_readable->do_page_fault)
  *
- *  ->i_mutex
- *    ->i_alloc_sem             (various)
- *
- *  ->inode_lock
- *    ->sb_lock                        (fs/fs-writeback.c)
+ *  bdi->wb.list_lock
+ *    sb_lock                  (fs/fs-writeback.c)
  *    ->mapping->tree_lock     (__sync_single_inode)
  *
- *  ->i_mmap_lock
+ *  ->i_mmap_mutex
  *    ->anon_vma.lock          (vma_adjust)
  *
  *  ->anon_vma.lock
  *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
- *    ->inode_lock             (page_remove_rmap->set_page_dirty)
- *    ->inode_lock             (zap_pte_range->set_page_dirty)
+ *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
  *
- *  ->task->proc_lock
- *    ->dcache_lock            (proc_pid_lookup)
+ *  (code doesn't rely on that order, so you could switch it around)
+ *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+ *    ->i_mmap_mutex
  */
 
 /*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.  The caller must hold the mapping's tree_lock.
  */
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
 
+       /*
+        * if we're uptodate, flush out into the cleancache, otherwise
+        * invalidate any existing cleancache entries.  We can't leave
+        * stale data around in the cleancache once our page is gone
+        */
+       if (PageUptodate(page) && PageMappedToDisk(page))
+               cleancache_put_page(page);
+       else
+               cleancache_flush_page(mapping, page);
+
        radix_tree_delete(&mapping->page_tree, page->index);
        page->mapping = NULL;
+       /* Leave page->index set: truncation lookup relies upon it */
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
+       if (PageSwapBacked(page))
+               __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
 
        /*
@@ -135,57 +147,42 @@ void __remove_from_page_cache(struct page *page)
        }
 }
 
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked.  It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+       void (*freepage)(struct page *);
 
        BUG_ON(!PageLocked(page));
 
+       freepage = mapping->a_ops->freepage;
        spin_lock_irq(&mapping->tree_lock);
-       __remove_from_page_cache(page);
+       __delete_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        mem_cgroup_uncharge_cache_page(page);
+
+       if (freepage)
+               freepage(page);
+       page_cache_release(page);
 }
+EXPORT_SYMBOL(delete_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-       struct address_space *mapping;
-       struct page *page;
-
-       page = container_of((unsigned long *)word, struct page, flags);
-
-       /*
-        * page_mapping() is being called without PG_locked held.
-        * Some knowledge of the state and use of the page is used to
-        * reduce the requirements down to a memory barrier.
-        * The danger here is of a stale page_mapping() return value
-        * indicating a struct address_space different from the one it's
-        * associated with when it is associated with one.
-        * After smp_mb(), it's either the correct page_mapping() for
-        * the page, or an old page_mapping() and the page's own
-        * page_mapping() has gone NULL.
-        * The ->sync_page() address_space operation must tolerate
-        * page_mapping() going NULL. By an amazing coincidence,
-        * this comes about because none of the users of the page
-        * in the ->sync_page() methods make essential use of the
-        * page_mapping(), merely passing the page down to the backing
-        * device's unplug functions when it's non-NULL, which in turn
-        * ignore it for all cases but swap, where only page_private(page) is
-        * of interest. When page_mapping() does go NULL, the entire
-        * call stack gracefully ignores the page and returns.
-        * -- wli
-        */
-       smp_mb();
-       mapping = page_mapping(page);
-       if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-               mapping->a_ops->sync_page(page);
        io_schedule();
        return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-       sync_page(word);
+       sleep_on_page(word);
        return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
@@ -255,27 +252,27 @@ int filemap_flush(struct address_space *mapping)
 EXPORT_SYMBOL(filemap_flush);
 
 /**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:   target address_space
- * @start:     beginning page index
- * @end:       ending page index
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:           address space structure to wait for
+ * @start_byte:                offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
  *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
  */
-int wait_on_page_writeback_range(struct address_space *mapping,
-                               pgoff_t start, pgoff_t end)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+                           loff_t end_byte)
 {
+       pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+       pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
        int ret = 0;
-       pgoff_t index;
 
-       if (end < start)
+       if (end_byte < start_byte)
                return 0;
 
        pagevec_init(&pvec, 0);
-       index = start;
        while ((index <= end) &&
                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                        PAGECACHE_TAG_WRITEBACK,
@@ -290,7 +287,7 @@ int wait_on_page_writeback_range(struct address_space *mapping,
                                continue;
 
                        wait_on_page_writeback(page);
-                       if (PageError(page))
+                       if (TestClearPageError(page))
                                ret = -EIO;
                }
                pagevec_release(&pvec);
@@ -305,70 +302,7 @@ int wait_on_page_writeback_range(struct address_space *mapping,
 
        return ret;
 }
-
-/**
- * sync_page_range - write and wait on all pages in the passed range
- * @inode:     target inode
- * @mapping:   target address_space
- * @pos:       beginning offset in pages to write
- * @count:     number of bytes to write
- *
- * Write and wait upon all the pages in the passed range.  This is a "data
- * integrity" operation.  It waits upon in-flight writeout before starting and
- * waiting upon new writeout.  If there was an IO error, return it.
- *
- * We need to re-take i_mutex during the generic_osync_inode list walk because
- * it is otherwise livelockable.
- */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
-                       loff_t pos, loff_t count)
-{
-       pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-       pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-       int ret;
-
-       if (!mapping_cap_writeback_dirty(mapping) || !count)
-               return 0;
-       ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-       if (ret == 0) {
-               mutex_lock(&inode->i_mutex);
-               ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-               mutex_unlock(&inode->i_mutex);
-       }
-       if (ret == 0)
-               ret = wait_on_page_writeback_range(mapping, start, end);
-       return ret;
-}
-EXPORT_SYMBOL(sync_page_range);
-
-/**
- * sync_page_range_nolock - write & wait on all pages in the passed range without locking
- * @inode:     target inode
- * @mapping:   target address_space
- * @pos:       beginning offset in pages to write
- * @count:     number of bytes to write
- *
- * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
- * as it forces O_SYNC writers to different parts of the same file
- * to be serialised right until io completion.
- */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-                          loff_t pos, loff_t count)
-{
-       pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-       pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-       int ret;
-
-       if (!mapping_cap_writeback_dirty(mapping) || !count)
-               return 0;
-       ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-       if (ret == 0)
-               ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-       if (ret == 0)
-               ret = wait_on_page_writeback_range(mapping, start, end);
-       return ret;
-}
-EXPORT_SYMBOL(sync_page_range_nolock);
+EXPORT_SYMBOL(filemap_fdatawait_range);
 
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -384,8 +318,7 @@ int filemap_fdatawait(struct address_space *mapping)
        if (i_size == 0)
                return 0;
 
-       return wait_on_page_writeback_range(mapping, 0,
-                               (i_size - 1) >> PAGE_CACHE_SHIFT);
+       return filemap_fdatawait_range(mapping, 0, i_size - 1);
 }
 EXPORT_SYMBOL(filemap_fdatawait);
 
@@ -432,9 +365,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
                                                 WB_SYNC_ALL);
                /* See comment of filemap_write_and_wait() */
                if (err != -EIO) {
-                       int err2 = wait_on_page_writeback_range(mapping,
-                                               lstart >> PAGE_CACHE_SHIFT,
-                                               lend >> PAGE_CACHE_SHIFT);
+                       int err2 = filemap_fdatawait_range(mapping,
+                                               lstart, lend);
                        if (!err)
                                err = err2;
                }
@@ -444,6 +376,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 EXPORT_SYMBOL(filemap_write_and_wait_range);
 
 /**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:       page to be replaced
+ * @new:       page to replace with
+ * @gfp_mask:  allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one.  On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page.  Both the old and new pages must be
+ * locked.  This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic.  The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+       int error;
+       struct mem_cgroup *memcg = NULL;
+
+       VM_BUG_ON(!PageLocked(old));
+       VM_BUG_ON(!PageLocked(new));
+       VM_BUG_ON(new->mapping);
+
+       /*
+        * This is not page migration, but prepare_migration and
+        * end_migration does enough work for charge replacement.
+        *
+        * In the longer term we probably want a specialized function
+        * for moving the charge from old to new in a more efficient
+        * manner.
+        */
+       error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+       if (error)
+               return error;
+
+       error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+       if (!error) {
+               struct address_space *mapping = old->mapping;
+               void (*freepage)(struct page *);
+
+               pgoff_t offset = old->index;
+               freepage = mapping->a_ops->freepage;
+
+               page_cache_get(new);
+               new->mapping = mapping;
+               new->index = offset;
+
+               spin_lock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(old);
+               error = radix_tree_insert(&mapping->page_tree, offset, new);
+               BUG_ON(error);
+               mapping->nrpages++;
+               __inc_zone_page_state(new, NR_FILE_PAGES);
+               if (PageSwapBacked(new))
+                       __inc_zone_page_state(new, NR_SHMEM);
+               spin_unlock_irq(&mapping->tree_lock);
+               radix_tree_preload_end();
+               if (freepage)
+                       freepage(old);
+               page_cache_release(old);
+               mem_cgroup_end_migration(memcg, old, new, true);
+       } else {
+               mem_cgroup_end_migration(memcg, old, new, false);
+       }
+
+       return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
  * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:      page to add
  * @mapping:   the page's address_space
@@ -459,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        int error;
 
        VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(PageSwapBacked(page));
 
        error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & GFP_RECLAIM_MASK);
@@ -479,6 +482,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
+                       /* Leave page->index set: truncation relies upon it */
                        spin_unlock_irq(&mapping->tree_lock);
                        mem_cgroup_uncharge_cache_page(page);
                        page_cache_release(page);
@@ -496,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
        int ret;
 
-       /*
-        * Splice_read and readahead add shmem/tmpfs pages into the page cache
-        * before shmem_readpage has a chance to mark them as SwapBacked: they
-        * need to go on the active_anon lru below, and mem_cgroup_cache_charge
-        * (called in add_to_page_cache) needs to know where they're going too.
-        */
-       if (mapping_cap_swap_backed(mapping))
-               SetPageSwapBacked(page);
-
        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-       if (ret == 0) {
-               if (page_is_file_cache(page))
-                       lru_cache_add_file(page);
-               else
-                       lru_cache_add_active_anon(page);
-       }
+       if (ret == 0)
+               lru_cache_add_file(page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -519,21 +510,21 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp)
 {
+       int n;
+       struct page *page;
+
        if (cpuset_do_page_mem_spread()) {
-               int n = cpuset_mem_spread_node();
-               return alloc_pages_node(n, gfp, 0);
+               get_mems_allowed();
+               n = cpuset_mem_spread_node();
+               page = alloc_pages_exact_node(n, gfp, 0);
+               put_mems_allowed();
+               return page;
        }
        return alloc_pages(gfp, 0);
 }
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-       io_schedule();
-       return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -561,11 +552,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
        if (test_bit(bit_nr, &page->flags))
-               __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+               __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
                                                        TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+       DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+       if (!test_bit(bit_nr, &page->flags))
+               return 0;
+
+       return __wait_on_bit(page_waitqueue(page), &wait,
+                            sleep_on_page_killable, TASK_KILLABLE);
+}
+
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
@@ -625,17 +627,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-       __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+       __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
                                                        TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -645,22 +642,40 @@ int __lock_page_killable(struct page *page)
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
        return __wait_on_bit_lock(page_waitqueue(page), &wait,
-                                       sync_page_killable, TASK_KILLABLE);
+                                       sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+                        unsigned int flags)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-       __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-                                                       TASK_UNINTERRUPTIBLE);
+       if (flags & FAULT_FLAG_ALLOW_RETRY) {
+               /*
+                * CAUTION! In this case, mmap_sem is not released
+                * even though return 0.
+                */
+               if (flags & FAULT_FLAG_RETRY_NOWAIT)
+                       return 0;
+
+               up_read(&mm->mmap_sem);
+               if (flags & FAULT_FLAG_KILLABLE)
+                       wait_on_page_locked_killable(page);
+               else
+                       wait_on_page_locked(page);
+               return 0;
+       } else {
+               if (flags & FAULT_FLAG_KILLABLE) {
+                       int ret;
+
+                       ret = __lock_page_killable(page);
+                       if (ret) {
+                               up_read(&mm->mmap_sem);
+                               return 0;
+                       }
+               } else
+                       __lock_page(page);
+               return 1;
+       }
 }
 
 /**
@@ -682,9 +697,18 @@ repeat:
        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
        if (pagep) {
                page = radix_tree_deref_slot(pagep);
-               if (unlikely(!page || page == RADIX_TREE_RETRY))
-                       goto repeat;
-
+               if (unlikely(!page))
+                       goto out;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto repeat;
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so return it without
+                        * attempting to raise page count.
+                        */
+                       goto out;
+               }
                if (!page_cache_get_speculative(page))
                        goto repeat;
 
@@ -698,6 +722,7 @@ repeat:
                        goto repeat;
                }
        }
+out:
        rcu_read_unlock();
 
        return page;
@@ -720,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
        page = find_get_page(mapping, offset);
-       if (page) {
+       if (page && !radix_tree_exception(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page->mapping != mapping)) {
@@ -802,25 +827,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
        unsigned int i;
        unsigned int ret;
-       unsigned int nr_found;
+       unsigned int nr_found, nr_skip;
 
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, start, nr_pages);
+                               (void ***)pages, NULL, start, nr_pages);
        ret = 0;
+       nr_skip = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
 repeat:
                page = radix_tree_deref_slot((void **)pages[i]);
                if (unlikely(!page))
                        continue;
-               /*
-                * this can only trigger if nr_found == 1, making livelock
-                * a non issue.
-                */
-               if (unlikely(page == RADIX_TREE_RETRY))
-                       goto restart;
+
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(start | i);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so skip over it -
+                        * we only reach this from invalidate_mapping_pages().
+                        */
+                       nr_skip++;
+                       continue;
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -834,6 +873,13 @@ repeat:
                pages[ret] = page;
                ret++;
        }
+
+       /*
+        * If all entries were removed before we could secure them,
+        * try again, because callers stop trying once 0 is returned.
+        */
+       if (unlikely(!ret && nr_found > nr_skip))
+               goto restart;
        rcu_read_unlock();
        return ret;
 }
@@ -860,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, index, nr_pages);
+                               (void ***)pages, NULL, index, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
@@ -868,15 +914,23 @@ repeat:
                page = radix_tree_deref_slot((void **)pages[i]);
                if (unlikely(!page))
                        continue;
-               /*
-                * this can only trigger if nr_found == 1, making livelock
-                * a non issue.
-                */
-               if (unlikely(page == RADIX_TREE_RETRY))
-                       goto restart;
 
-               if (page->mapping == NULL || page->index != index)
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so stop looking for
+                        * contiguous pages.
+                        */
                        break;
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -887,6 +941,16 @@ repeat:
                        goto repeat;
                }
 
+               /*
+                * must check mapping and index after taking the ref.
+                * otherwise we can get both false positives and false
+                * negatives, which is just confusing to the caller.
+                */
+               if (page->mapping == NULL || page->index != index) {
+                       page_cache_release(page);
+                       break;
+               }
+
                pages[ret] = page;
                ret++;
                index++;
@@ -925,12 +989,22 @@ repeat:
                page = radix_tree_deref_slot((void **)pages[i]);
                if (unlikely(!page))
                        continue;
-               /*
-                * this can only trigger if nr_found == 1, making livelock
-                * a non issue.
-                */
-               if (unlikely(page == RADIX_TREE_RETRY))
-                       goto restart;
+
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * This function is never used on a shmem/tmpfs
+                        * mapping, so a swap entry won't be found here.
+                        */
+                       BUG();
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -944,6 +1018,13 @@ repeat:
                pages[ret] = page;
                ret++;
        }
+
+       /*
+        * If all entries were removed before we could secure them,
+        * try again, because callers stop trying once 0 is returned.
+        */
+       if (unlikely(!ret && nr_found))
+               goto restart;
        rcu_read_unlock();
 
        if (ret)
@@ -1004,9 +1085,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
 static void shrink_readahead_size_eio(struct file *filp,
                                        struct file_ra_state *ra)
 {
-       if (!ra->ra_pages)
-               return;
-
        ra->ra_pages /= 4;
 }
 
@@ -1070,6 +1148,9 @@ find_page:
                                goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
+                       /* Did it get truncated before we got the lock? */
+                       if (!page->mapping)
+                               goto page_not_up_to_date_locked;
                        if (!mapping->a_ops->is_partially_uptodate(page,
                                                                desc, offset))
                                goto page_not_up_to_date_locked;
@@ -1160,6 +1241,12 @@ page_not_up_to_date_locked:
                }
 
 readpage:
+               /*
+                * A previous I/O error may have been due to temporary
+                * failures, eg. multipath errors.
+                * PG_error will be set again if readpage fails.
+                */
+               ClearPageError(page);
                /* Start the actual read. The read will unlock the page. */
                error = mapping->a_ops->readpage(filp, page);
 
@@ -1178,7 +1265,7 @@ readpage:
                        if (!PageUptodate(page)) {
                                if (page->mapping == NULL) {
                                        /*
-                                        * invalidate_inode_pages got it
+                                        * invalidate_mapping_pages got it
                                         */
                                        unlock_page(page);
                                        page_cache_release(page);
@@ -1324,15 +1411,18 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 {
        struct file *filp = iocb->ki_filp;
        ssize_t retval;
-       unsigned long seg;
+       unsigned long seg = 0;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;
+       struct blk_plug plug;
 
        count = 0;
        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
        if (retval)
                return retval;
 
+       blk_start_plug(&plug);
+
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t size;
@@ -1351,21 +1441,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                retval = mapping->a_ops->direct_IO(READ, iocb,
                                                        iov, pos, nr_segs);
                        }
-                       if (retval > 0)
+                       if (retval > 0) {
                                *ppos = pos + retval;
-                       if (retval) {
+                               count -= retval;
+                       }
+
+                       /*
+                        * Btrfs can have a short DIO read if we encounter
+                        * compressed extents, so if there was an error, or if
+                        * we've already read everything we wanted to, or if
+                        * there was a short read because we hit EOF, go ahead
+                        * and return.  Otherwise fallthrough to buffered io for
+                        * the rest of the read.
+                        */
+                       if (retval < 0 || !count || *ppos >= size) {
                                file_accessed(filp);
                                goto out;
                        }
                }
        }
 
+       count = retval;
        for (seg = 0; seg < nr_segs; seg++) {
                read_descriptor_t desc;
+               loff_t offset = 0;
+
+               /*
+                * If we did a short DIO read we need to skip the section of the
+                * iov that we've already read data into.
+                */
+               if (count) {
+                       if (count > iov[seg].iov_len) {
+                               count -= iov[seg].iov_len;
+                               continue;
+                       }
+                       offset = count;
+                       count = 0;
+               }
 
                desc.written = 0;
-               desc.arg.buf = iov[seg].iov_base;
-               desc.count = iov[seg].iov_len;
+               desc.arg.buf = iov[seg].iov_base + offset;
+               desc.count = iov[seg].iov_len - offset;
                if (desc.count == 0)
                        continue;
                desc.error = 0;
@@ -1379,6 +1495,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                        break;
        }
 out:
+       blk_finish_plug(&plug);
        return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -1471,14 +1588,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        /* If we don't want any read-ahead, don't bother */
        if (VM_RandomReadHint(vma))
                return;
+       if (!ra->ra_pages)
+               return;
 
-       if (VM_SequentialReadHint(vma) ||
-                       offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
-               page_cache_sync_readahead(mapping, ra, file, offset, 1);
+       if (VM_SequentialReadHint(vma)) {
+               page_cache_sync_readahead(mapping, ra, file, offset,
+                                         ra->ra_pages);
                return;
        }
 
-       if (ra->mmap_miss < INT_MAX)
+       /* Avoid banging the cache line if not needed */
+       if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
                ra->mmap_miss++;
 
        /*
@@ -1488,14 +1608,14 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        if (ra->mmap_miss > MMAP_LOTSAMISS)
                return;
 
+       /*
+        * mmap read-around
+        */
        ra_pages = max_sane_readahead(ra->ra_pages);
-       if (ra_pages) {
-               pgoff_t start = 0;
-
-               if (offset > ra_pages / 2)
-                       start = offset - ra_pages / 2;
-               do_page_cache_readahead(mapping, file, start, ra_pages);
-       }
+       ra->start = max_t(long, 0, offset - ra_pages / 2);
+       ra->size = ra_pages;
+       ra->async_size = ra_pages / 4;
+       ra_submit(ra, mapping, file);
 }
 
 /*
@@ -1516,7 +1636,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
        if (ra->mmap_miss > 0)
                ra->mmap_miss--;
        if (PageReadahead(page))
-               page_cache_async_readahead(mapping, ra, file, page, offset, 1);
+               page_cache_async_readahead(mapping, ra, file,
+                                          page, offset, ra->ra_pages);
 }
 
 /**
@@ -1557,25 +1678,31 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                 * waiting for the lock.
                 */
                do_async_mmap_readahead(vma, ra, file, page, offset);
-               lock_page(page);
-
-               /* Did it get truncated? */
-               if (unlikely(page->mapping != mapping)) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto no_cached_page;
-               }
        } else {
                /* No page in the page cache at all */
                do_sync_mmap_readahead(vma, ra, file, offset);
                count_vm_event(PGMAJFAULT);
+               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
 retry_find:
-               page = find_lock_page(mapping, offset);
+               page = find_get_page(mapping, offset);
                if (!page)
                        goto no_cached_page;
        }
 
+       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+               page_cache_release(page);
+               return ret | VM_FAULT_RETRY;
+       }
+
+       /* Did it get truncated? */
+       if (unlikely(page->mapping != mapping)) {
+               unlock_page(page);
+               put_page(page);
+               goto retry_find;
+       }
+       VM_BUG_ON(page->index != offset);
+
        /*
         * We have a locked page in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error.
@@ -1594,7 +1721,6 @@ retry_find:
                return VM_FAULT_SIGBUS;
        }
 
-       ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;
 
@@ -1647,7 +1773,7 @@ page_not_uptodate:
 }
 EXPORT_SYMBOL(filemap_fault);
 
-struct vm_operations_struct generic_file_vm_ops = {
+const struct vm_operations_struct generic_file_vm_ops = {
        .fault          = filemap_fault,
 };
 
@@ -1690,15 +1816,16 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 
 static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
-                               void *data)
+                               int (*filler)(void *, struct page *),
+                               void *data,
+                               gfp_t gfp)
 {
        struct page *page;
        int err;
 repeat:
        page = find_get_page(mapping, index);
        if (!page) {
-               page = page_cache_alloc_cold(mapping);
+               page = __page_cache_alloc(gfp | __GFP_COLD);
                if (!page)
                        return ERR_PTR(-ENOMEM);
                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1718,31 +1845,18 @@ repeat:
        return page;
 }
 
-/**
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping:   the page's address_space
- * @index:     the page index
- * @filler:    function to perform the read
- * @data:      destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
-                               void *data)
+                               int (*filler)(void *, struct page *),
+                               void *data,
+                               gfp_t gfp)
+
 {
        struct page *page;
        int err;
 
 retry:
-       page = __read_cache_page(mapping, index, filler, data);
+       page = __read_cache_page(mapping, index, filler, data, gfp);
        if (IS_ERR(page))
                return page;
        if (PageUptodate(page))
@@ -1767,14 +1881,73 @@ out:
        mark_page_accessed(page);
        return page;
 }
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping:   the page's address_space
+ * @index:     the page index
+ * @filler:    function to perform the read
+ * @data:      first arg to filler(data, page) function, often left as NULL
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+                               pgoff_t index,
+                               int (*filler)(void *, struct page *),
+                               void *data)
+{
+       return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
 EXPORT_SYMBOL(read_cache_page_async);
 
+static struct page *wait_on_page_read(struct page *page)
+{
+       if (!IS_ERR(page)) {
+               wait_on_page_locked(page);
+               if (!PageUptodate(page)) {
+                       page_cache_release(page);
+                       page = ERR_PTR(-EIO);
+               }
+       }
+       return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:   the page's address_space
+ * @index:     the page index
+ * @gfp:       the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+                               pgoff_t index,
+                               gfp_t gfp)
+{
+       filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+       return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
 /**
  * read_cache_page - read into page cache, fill it if needed
  * @mapping:   the page's address_space
  * @index:     the page index
  * @filler:    function to perform the read
- * @data:      destination for read data
+ * @data:      first arg to filler(data, page) function, often left as NULL
  *
  * Read into the page cache. If a page already exists, and PageUptodate() is
  * not set, try to fill the page then wait for it to become unlocked.
@@ -1783,21 +1956,10 @@ EXPORT_SYMBOL(read_cache_page_async);
  */
 struct page *read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
+                               int (*filler)(void *, struct page *),
                                void *data)
 {
-       struct page *page;
-
-       page = read_cache_page_async(mapping, index, filler, data);
-       if (IS_ERR(page))
-               goto out;
-       wait_on_page_locked(page);
-       if (!PageUptodate(page)) {
-               page_cache_release(page);
-               page = ERR_PTR(-EIO);
-       }
- out:
-       return page;
+       return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
 }
 EXPORT_SYMBOL(read_cache_page);
 
@@ -1841,16 +2003,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
 int file_remove_suid(struct file *file)
 {
        struct dentry *dentry = file->f_path.dentry;
-       int killsuid = should_remove_suid(dentry);
-       int killpriv = security_inode_need_killpriv(dentry);
+       struct inode *inode = dentry->d_inode;
+       int killsuid;
+       int killpriv;
        int error = 0;
 
+       /* Fast path for nothing security related */
+       if (IS_NOSEC(inode))
+               return 0;
+
+       killsuid = should_remove_suid(dentry);
+       killpriv = security_inode_need_killpriv(dentry);
+
        if (killpriv < 0)
                return killpriv;
        if (killpriv)
                error = security_inode_killpriv(dentry);
        if (!error && killsuid)
                error = __remove_suid(dentry, killsuid);
+       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+               inode->i_flags |= S_NOSEC;
 
        return error;
 }
@@ -1880,7 +2052,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 
 /*
  * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
+ * were successfully copied.  If a fault is encountered then return the number of
  * bytes which were copied.
  */
 size_t iov_iter_copy_from_user_atomic(struct page *page,
@@ -2007,7 +2179,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
 {
        struct inode *inode = file->f_mapping->host;
-       unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+       unsigned long limit = rlimit(RLIMIT_FSIZE);
 
         if (unlikely(*pos < 0))
                 return -EINVAL;
@@ -2159,27 +2331,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
        }
 
        if (written > 0) {
-               loff_t end = pos + written;
-               if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
-                       i_size_write(inode,  end);
+               pos += written;
+               if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+                       i_size_write(inode, pos);
                        mark_inode_dirty(inode);
                }
-               *ppos = end;
+               *ppos = pos;
        }
-
-       /*
-        * Sync the fs metadata but not the minor inode changes and
-        * of course not the data as we did direct DMA for the IO.
-        * i_mutex is held, which protects generic_osync_inode() from
-        * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
-        */
 out:
-       if ((written >= 0 || written == -EIOCBQUEUED) &&
-           ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-               int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-               if (err < 0)
-                       written = err;
-       }
        return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2198,8 +2357,8 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
                gfp_notmask = __GFP_FS;
 repeat:
        page = find_lock_page(mapping, index);
-       if (likely(page))
-               return page;
+       if (page)
+               goto found;
 
        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
        if (!page)
@@ -2212,6 +2371,8 @@ repeat:
                        goto repeat;
                return NULL;
        }
+found:
+       wait_on_page_writeback(page);
        return page;
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2233,14 +2394,12 @@ static ssize_t generic_perform_write(struct file *file,
 
        do {
                struct page *page;
-               pgoff_t index;          /* Pagecache index for current page */
                unsigned long offset;   /* Offset into pagecache page */
                unsigned long bytes;    /* Bytes to write to page */
                size_t copied;          /* Bytes copied from user */
                void *fsdata;
 
                offset = (pos & (PAGE_CACHE_SIZE - 1));
-               index = pos >> PAGE_CACHE_SHIFT;
                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_count(i));
 
@@ -2266,11 +2425,15 @@ again:
                if (unlikely(status))
                        break;
 
+               if (mapping_writably_mapped(mapping))
+                       flush_dcache_page(page);
+
                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
                flush_dcache_page(page);
 
+               mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
@@ -2309,9 +2472,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                size_t count, ssize_t written)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       const struct address_space_operations *a_ops = mapping->a_ops;
-       struct inode *inode = mapping->host;
        ssize_t status;
        struct iov_iter i;
 
@@ -2321,34 +2481,33 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
-
-               /*
-                * For now, when the user asks for O_SYNC, we'll actually give
-                * O_DSYNC
-                */
-               if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-                       if (!a_ops->writepage || !is_sync_kiocb(iocb))
-                               status = generic_osync_inode(inode, mapping,
-                                               OSYNC_METADATA|OSYNC_DATA);
-               }
        }
        
-       /*
-        * If we get here for O_DIRECT writes then we must have fallen through
-        * to buffered writes (block instantiation inside i_size).  So we sync
-        * the file data here, to try to honour O_DIRECT expectations.
-        */
-       if (unlikely(file->f_flags & O_DIRECT) && written)
-               status = filemap_write_and_wait_range(mapping,
-                                       pos, pos + written - 1);
-
        return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
 
-static ssize_t
-__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
-                               unsigned long nr_segs, loff_t *ppos)
+/**
+ * __generic_file_aio_write - write data to a file
+ * @iocb:      IO state structure (file, offset, etc.)
+ * @iov:       vector with data to write
+ * @nr_segs:   number of segments in the vector
+ * @ppos:      position where to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_mutex.
+ */
+ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t *ppos)
 {
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
@@ -2422,10 +2581,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                 * semantics.
                 */
                endbyte = pos + written_buffered - written - 1;
-               err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
-                                           SYNC_FILE_RANGE_WAIT_BEFORE|
-                                           SYNC_FILE_RANGE_WRITE|
-                                           SYNC_FILE_RANGE_WAIT_AFTER);
+               err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
                if (err == 0) {
                        written = written_buffered;
                        invalidate_mapping_pages(mapping,
@@ -2445,53 +2601,42 @@ out:
        current->backing_dev_info = NULL;
        return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_aio_write);
 
-ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
-               const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
-       struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       ssize_t ret;
-
-       BUG_ON(iocb->ki_pos != pos);
-
-       ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-                       &iocb->ki_pos);
-
-       if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-               ssize_t err;
-
-               err = sync_page_range_nolock(inode, mapping, pos, ret);
-               if (err < 0)
-                       ret = err;
-       }
-       return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
-
+/**
+ * generic_file_aio_write - write data to a file
+ * @iocb:      IO state structure
+ * @iov:       vector with data to write
+ * @nr_segs:   number of segments in the vector
+ * @pos:       position in file where to write
+ *
+ * This is a wrapper around __generic_file_aio_write() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_mutex as needed.
+ */
 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
+       struct inode *inode = file->f_mapping->host;
+       struct blk_plug plug;
        ssize_t ret;
 
        BUG_ON(iocb->ki_pos != pos);
 
        mutex_lock(&inode->i_mutex);
-       ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-                       &iocb->ki_pos);
+       blk_start_plug(&plug);
+       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);
 
-       if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+       if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;
 
-               err = sync_page_range(inode, mapping, pos, ret);
-               if (err < 0)
+               err = generic_write_sync(file, pos, ret);
+               if (err < 0 && ret > 0)
                        ret = err;
        }
+       blk_finish_plug(&plug);
        return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);