mm: avoid livelock on !__GFP_FS allocations
[linux-2.6.git] / mm / filemap.c
index 7455ccd..7771871 100644 (file)
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
  *  ->i_mutex                  (generic_file_buffered_write)
  *    ->mmap_sem               (fault_in_pages_readable->do_page_fault)
  *
- *  ->i_mutex
- *    ->i_alloc_sem             (various)
- *
- *  inode_wb_list_lock
+ *  bdi->wb.list_lock
  *    sb_lock                  (fs/fs-writeback.c)
  *    ->mapping->tree_lock     (__sync_single_inode)
  *
@@ -99,9 +95,9 @@
  *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock       (page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock       (zap_pte_range->set_page_dirty)
+ *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
  *
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page)
 
        radix_tree_delete(&mapping->page_tree, page->index);
        page->mapping = NULL;
+       /* Leave page->index set: truncation lookup relies upon it */
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
@@ -464,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        int error;
 
        VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(PageSwapBacked(page));
 
        error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & GFP_RECLAIM_MASK);
@@ -481,11 +479,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                       if (PageSwapBacked(page))
-                               __inc_zone_page_state(page, NR_SHMEM);
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
+                       /* Leave page->index set: truncation relies upon it */
                        spin_unlock_irq(&mapping->tree_lock);
                        mem_cgroup_uncharge_cache_page(page);
                        page_cache_release(page);
@@ -503,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
        int ret;
 
-       /*
-        * Splice_read and readahead add shmem/tmpfs pages into the page cache
-        * before shmem_readpage has a chance to mark them as SwapBacked: they
-        * need to go on the anon lru below, and mem_cgroup_cache_charge
-        * (called in add_to_page_cache) needs to know where they're going too.
-        */
-       if (mapping_cap_swap_backed(mapping))
-               SetPageSwapBacked(page);
-
        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-       if (ret == 0) {
-               if (page_is_file_cache(page))
-                       lru_cache_add_file(page);
-               else
-                       lru_cache_add_anon(page);
-       }
+       if (ret == 0)
+               lru_cache_add_file(page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -715,9 +699,16 @@ repeat:
                page = radix_tree_deref_slot(pagep);
                if (unlikely(!page))
                        goto out;
-               if (radix_tree_deref_retry(page))
-                       goto repeat;
-
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto repeat;
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so return it without
+                        * attempting to raise page count.
+                        */
+                       goto out;
+               }
                if (!page_cache_get_speculative(page))
                        goto repeat;
 
@@ -754,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
        page = find_get_page(mapping, offset);
-       if (page) {
+       if (page && !radix_tree_exception(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page->mapping != mapping)) {
@@ -836,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
        unsigned int i;
        unsigned int ret;
-       unsigned int nr_found;
+       unsigned int nr_found, nr_skip;
 
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, start, nr_pages);
+                               (void ***)pages, NULL, start, nr_pages);
        ret = 0;
+       nr_skip = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
 repeat:
@@ -850,13 +842,23 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page)) {
-                       WARN_ON(start | i);
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(start | i);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so skip over it -
+                        * we only reach this from invalidate_mapping_pages().
+                        */
+                       nr_skip++;
+                       continue;
                }
 
                if (!page_cache_get_speculative(page))
@@ -876,7 +878,7 @@ repeat:
         * If all entries were removed before we could secure them,
         * try again, because callers stop trying once 0 is returned.
         */
-       if (unlikely(!ret && nr_found))
+       if (unlikely(!ret && nr_found > nr_skip))
                goto restart;
        rcu_read_unlock();
        return ret;
@@ -904,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, index, nr_pages);
+                               (void ***)pages, NULL, index, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
@@ -913,12 +915,22 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so stop looking for
+                        * contiguous pages.
+                        */
+                       break;
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -978,12 +990,21 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * This function is never used on a shmem/tmpfs
+                        * mapping, so a swap entry won't be found here.
+                        */
+                       BUG();
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -1661,6 +1682,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                /* No page in the page cache at all */
                do_sync_mmap_readahead(vma, ra, file, offset);
                count_vm_event(PGMAJFAULT);
+               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
 retry_find:
                page = find_get_page(mapping, offset);
@@ -1794,7 +1816,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 
 static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
+                               int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
 {
@@ -1825,7 +1847,7 @@ repeat:
 
 static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
+                               int (*filler)(void *, struct page *),
                                void *data,
                                gfp_t gfp)
 
@@ -1865,7 +1887,7 @@ out:
  * @mapping:   the page's address_space
  * @index:     the page index
  * @filler:    function to perform the read
- * @data:      destination for read data
+ * @data:      first arg to filler(data, page) function, often left as NULL
  *
  * Same as read_cache_page, but don't wait for page to become unlocked
  * after submitting it to the filler.
@@ -1877,7 +1899,7 @@ out:
  */
 struct page *read_cache_page_async(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
+                               int (*filler)(void *, struct page *),
                                void *data)
 {
        return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1925,7 +1947,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
  * @mapping:   the page's address_space
  * @index:     the page index
  * @filler:    function to perform the read
- * @data:      destination for read data
+ * @data:      first arg to filler(data, page) function, often left as NULL
  *
  * Read into the page cache. If a page already exists, and PageUptodate() is
  * not set, try to fill the page then wait for it to become unlocked.
@@ -1934,7 +1956,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
  */
 struct page *read_cache_page(struct address_space *mapping,
                                pgoff_t index,
-                               int (*filler)(void *,struct page*),
+                               int (*filler)(void *, struct page *),
                                void *data)
 {
        return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@ -1981,16 +2003,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
 int file_remove_suid(struct file *file)
 {
        struct dentry *dentry = file->f_path.dentry;
-       int killsuid = should_remove_suid(dentry);
-       int killpriv = security_inode_need_killpriv(dentry);
+       struct inode *inode = dentry->d_inode;
+       int killsuid;
+       int killpriv;
        int error = 0;
 
+       /* Fast path for nothing security related */
+       if (IS_NOSEC(inode))
+               return 0;
+
+       killsuid = should_remove_suid(dentry);
+       killpriv = security_inode_need_killpriv(dentry);
+
        if (killpriv < 0)
                return killpriv;
        if (killpriv)
                error = security_inode_killpriv(dentry);
        if (!error && killsuid)
                error = __remove_suid(dentry, killsuid);
+       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+               inode->i_flags |= S_NOSEC;
 
        return error;
 }
@@ -2326,7 +2358,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
 repeat:
        page = find_lock_page(mapping, index);
        if (page)
-               return page;
+               goto found;
 
        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
        if (!page)
@@ -2339,6 +2371,8 @@ repeat:
                        goto repeat;
                return NULL;
        }
+found:
+       wait_on_page_writeback(page);
        return page;
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);