diff --git a/mm/filemap.c b/mm/filemap.c
index 10a17111327362f9b0d5f30dc5f8519983e8f1e8..2f8165075a5a9ac67358f2b51dac2bf2cf14f6c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
  * most "normal" filesystems (but you don't /have/ to use this:
  * the NFS filesystem used to do this differently, for example)
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -78,7 +77,7 @@
  *  ->i_mutex                  (generic_file_buffered_write)
  *    ->mmap_sem               (fault_in_pages_readable->do_page_fault)
  *
- *  inode_wb_list_lock
+ *  bdi->wb.list_lock
  *    sb_lock                  (fs/fs-writeback.c)
  *    ->mapping->tree_lock     (__sync_single_inode)
  *
@@ -96,9 +95,9 @@
  *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock       (page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock       (zap_pte_range->set_page_dirty)
+ *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
  *
@@ -394,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
        int error;
-       struct mem_cgroup *memcg = NULL;
 
        VM_BUG_ON(!PageLocked(old));
        VM_BUG_ON(!PageLocked(new));
        VM_BUG_ON(new->mapping);
 
-       /*
-        * This is not page migration, but prepare_migration and
-        * end_migration does enough work for charge replacement.
-        *
-        * In the longer term we probably want a specialized function
-        * for moving the charge from old to new in a more efficient
-        * manner.
-        */
-       error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
-       if (error)
-               return error;
-
        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (!error) {
                struct address_space *mapping = old->mapping;
@@ -433,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irq(&mapping->tree_lock);
+               /* mem_cgroup codes must not be called under tree_lock */
+               mem_cgroup_replace_page_cache(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
                page_cache_release(old);
-               mem_cgroup_end_migration(memcg, old, new, true);
-       } else {
-               mem_cgroup_end_migration(memcg, old, new, false);
        }
 
        return error;
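With the migration-charge dance gone, the memcg hand-off is a single call made after tree_lock is dropped, and the contract for callers is unchanged: both pages locked, the new page not yet in any mapping. A minimal caller sketch; the helper name and scenario are hypothetical:

#include <linux/pagemap.h>	/* replace_page_cache_page(), page locking */
#include <linux/highmem.h>	/* copy_highpage() */

/* Hypothetical: substitute a freshly allocated newpage for oldpage. */
static int substitute_cache_page(struct page *oldpage, struct page *newpage)
{
        int err;

        /* Assumed: both pages are locked and newpage->mapping is NULL. */
        copy_highpage(newpage, oldpage);        /* carry the data across */

        err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
        if (err)
                return err;     /* typically -ENOMEM from radix_tree_preload() */

        /* newpage now sits at oldpage's index; the memcg charge moved too. */
        return 0;
}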
@@ -462,6 +447,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
        int error;
 
        VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(PageSwapBacked(page));
 
        error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +465,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                       if (PageSwapBacked(page))
-                               __inc_zone_page_state(page, NR_SHMEM);
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
@@ -502,22 +486,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
        int ret;
 
-       /*
-        * Splice_read and readahead add shmem/tmpfs pages into the page cache
-        * before shmem_readpage has a chance to mark them as SwapBacked: they
-        * need to go on the anon lru below, and mem_cgroup_cache_charge
-        * (called in add_to_page_cache) needs to know where they're going too.
-        */
-       if (mapping_cap_swap_backed(mapping))
-               SetPageSwapBacked(page);
-
        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-       if (ret == 0) {
-               if (page_is_file_cache(page))
-                       lru_cache_add_file(page);
-               else
-                       lru_cache_add_anon(page);
-       }
+       if (ret == 0)
+               lru_cache_add_file(page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
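Since shmem/tmpfs no longer routes its swap-backed pages through here (the new VM_BUG_ON in add_to_page_cache_locked enforces that), everything added by this helper is plain file cache and goes on the file LRU unconditionally. A hedged sketch of a typical consumer, modelled loosely on read_cache_pages(); fill_range() and its filler argument are hypothetical:

#include <linux/pagemap.h>
#include <linux/gfp.h>

/*
 * Hypothetical readahead-style helper: populate @nr pages starting at
 * @start, reading each one with the caller-supplied @filler.
 */
static int fill_range(struct address_space *mapping, pgoff_t start,
                      unsigned long nr, int (*filler)(void *, struct page *),
                      void *data)
{
        unsigned long i;
        int err;

        for (i = 0; i < nr; i++) {
                struct page *page = page_cache_alloc_cold(mapping);

                if (!page)
                        return -ENOMEM;

                err = add_to_page_cache_lru(page, mapping, start + i,
                                            GFP_KERNEL);
                if (err) {
                        /* -EEXIST means someone else cached this index already. */
                        page_cache_release(page);
                        if (err == -EEXIST)
                                continue;
                        return err;
                }

                /* Page is locked and on the file LRU; the filler unlocks it. */
                err = filler(data, page);
                page_cache_release(page);       /* drop our allocation ref */
                if (err)
                        return err;
        }
        return 0;
}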
@@ -714,9 +685,16 @@ repeat:
                page = radix_tree_deref_slot(pagep);
                if (unlikely(!page))
                        goto out;
-               if (radix_tree_deref_retry(page))
-                       goto repeat;
-
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto repeat;
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so return it without
+                        * attempting to raise page count.
+                        */
+                       goto out;
+               }
                if (!page_cache_get_speculative(page))
                        goto repeat;
 
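find_get_page() can now return a shmem/tmpfs swap entry, encoded as an exceptional radix-tree entry, instead of a page pointer, so callers that might run against such mappings must test for it before treating the result as a struct page. A sketch of the caller-side check; lookup_cache_page() is hypothetical:

#include <linux/pagemap.h>
#include <linux/radix-tree.h>

/*
 * Hypothetical lookup: return the cached page at @index, or NULL if the
 * slot is empty or holds a shmem swap entry (an exceptional entry).
 */
static struct page *lookup_cache_page(struct address_space *mapping,
                                      pgoff_t index)
{
        struct page *page = find_get_page(mapping, index);

        if (radix_tree_exceptional_entry(page)) {
                /* Not a page: a swap entry stored by shmem/tmpfs. */
                return NULL;
        }
        return page;    /* NULL, or a page with an elevated refcount */
}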
@@ -753,7 +731,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
        page = find_get_page(mapping, offset);
-       if (page) {
+       if (page && !radix_tree_exception(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page->mapping != mapping)) {
@@ -835,13 +813,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
        unsigned int i;
        unsigned int ret;
-       unsigned int nr_found;
+       unsigned int nr_found, nr_skip;
 
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, start, nr_pages);
+                               (void ***)pages, NULL, start, nr_pages);
        ret = 0;
+       nr_skip = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
 repeat:
@@ -849,13 +828,23 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page)) {
-                       WARN_ON(start | i);
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(start | i);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so skip over it -
+                        * we only reach this from invalidate_mapping_pages().
+                        */
+                       nr_skip++;
+                       continue;
                }
 
                if (!page_cache_get_speculative(page))
@@ -875,7 +864,7 @@ repeat:
         * If all entries were removed before we could secure them,
         * try again, because callers stop trying once 0 is returned.
         */
-       if (unlikely(!ret && nr_found))
+       if (unlikely(!ret && nr_found > nr_skip))
                goto restart;
        rcu_read_unlock();
        return ret;
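The nr_skip count keeps the restart heuristic honest: exceptional entries are "found" by the gang lookup but never returned, so only a batch whose real pages were all removed underneath us retries. Callers see the usual pagevec contract, as in this hedged scan sketch (walk_mapping() and process_page() are hypothetical):

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Hypothetical walk over every cached page, PAGEVEC_SIZE pages per batch. */
static void walk_mapping(struct address_space *mapping,
                         void (*process_page)(struct page *))
{
        struct pagevec pvec;
        pgoff_t index = 0;
        unsigned i, nr;

        pagevec_init(&pvec, 0);
        while ((nr = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE))) {
                for (i = 0; i < nr; i++) {
                        struct page *page = pvec.pages[i];

                        index = page->index + 1;        /* resume after this page */
                        process_page(page);
                }
                pagevec_release(&pvec);         /* drop the gang-lookup refs */
                cond_resched();
        }
}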
@@ -903,7 +892,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
        rcu_read_lock();
 restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-                               (void ***)pages, index, nr_pages);
+                               (void ***)pages, NULL, index, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
@@ -912,12 +901,22 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so stop looking for
+                        * contiguous pages.
+                        */
+                       break;
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
@@ -977,12 +976,21 @@ repeat:
                if (unlikely(!page))
                        continue;
 
-               /*
-                * This can only trigger when the entry at index 0 moves out
-                * of or back to the root: none yet gotten, safe to restart.
-                */
-               if (radix_tree_deref_retry(page))
-                       goto restart;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+                       /*
+                        * This function is never used on a shmem/tmpfs
+                        * mapping, so a swap entry won't be found here.
+                        */
+                       BUG();
+               }
 
                if (!page_cache_get_speculative(page))
                        goto repeat;
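Here the BUG() documents an invariant rather than handling a case: tag lookups serve the dirty/writeback scans, and shmem never tags the slots holding its swap entries, so an exceptional entry in a tagged slot would mean corruption. For reference, the tag-lookup path is normally consumed through pagevec_lookup_tag(), roughly as below (scan_dirty_pages() is a hypothetical, heavily trimmed writeback-style walk):

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/fs.h>           /* PAGECACHE_TAG_DIRTY */

/* Hypothetical scan of a mapping's dirty pages, in index order. */
static void scan_dirty_pages(struct address_space *mapping)
{
        struct pagevec pvec;
        pgoff_t index = 0;
        unsigned i, nr;

        pagevec_init(&pvec, 0);
        while ((nr = pagevec_lookup_tag(&pvec, mapping, &index,
                                        PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
                for (i = 0; i < nr; i++) {
                        struct page *page = pvec.pages[i];

                        lock_page(page);
                        if (page->mapping == mapping && PageDirty(page)) {
                                /* ... start writeback on the page ... */
                        }
                        unlock_page(page);
                }
                pagevec_release(&pvec);
        }
}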
@@ -1310,10 +1318,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
         * taking the kmap.
         */
        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = kmap_atomic(page);
                left = __copy_to_user_inatomic(desc->arg.buf,
                                                kaddr + offset, size);
-               kunmap_atomic(kaddr, KM_USER0);
+               kunmap_atomic(kaddr);
                if (left == 0)
                        goto success;
        }
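This is part of the tree-wide move from the slotted kmap_atomic(page, KM_USERx) API to the stacking form that needs no slot argument; kunmap_atomic() likewise now takes only the mapped address. The conversion pattern, shown on a hypothetical helper:

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical helper: zero a sub-range of a (possibly highmem) page. */
static void zero_page_range(struct page *page, unsigned int offset,
                            unsigned int len)
{
        char *kaddr;

        /* Old style (deprecated): kaddr = kmap_atomic(page, KM_USER0); */
        kaddr = kmap_atomic(page);
        memset(kaddr + offset, 0, len);
        /* Old style (deprecated): kunmap_atomic(kaddr, KM_USER0); */
        kunmap_atomic(kaddr);
}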
@@ -1392,15 +1400,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long seg = 0;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;
-       struct blk_plug plug;
 
        count = 0;
        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
        if (retval)
                return retval;
 
-       blk_start_plug(&plug);
-
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t size;
@@ -1416,8 +1421,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                        retval = filemap_write_and_wait_range(mapping, pos,
                                        pos + iov_length(iov, nr_segs) - 1);
                        if (!retval) {
+                               struct blk_plug plug;
+
+                               blk_start_plug(&plug);
                                retval = mapping->a_ops->direct_IO(READ, iocb,
                                                        iov, pos, nr_segs);
+                               blk_finish_plug(&plug);
                        }
                        if (retval > 0) {
                                *ppos = pos + retval;
@@ -1473,7 +1482,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                        break;
        }
 out:
-       blk_finish_plug(&plug);
        return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
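The plug now brackets only the O_DIRECT submission instead of the whole of generic_file_aio_read(). Plugging itself is a simple bracketing API around request submission, sketched here on a hypothetical batch submitter:

#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/fs.h>           /* READ */

/* Hypothetical: submit a batch of prepared bios under one plug. */
static void submit_bio_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);          /* queue requests per-task for now */
        for (i = 0; i < nr; i++)
                submit_bio(READ, bios[i]);
        blk_finish_plug(&plug);         /* flush the batch to the device(s) */
}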
@@ -1806,7 +1814,7 @@ repeat:
                page = __page_cache_alloc(gfp | __GFP_COLD);
                if (!page)
                        return ERR_PTR(-ENOMEM);
-               err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+               err = add_to_page_cache_lru(page, mapping, index, gfp);
                if (unlikely(err)) {
                        page_cache_release(page);
                        if (err == -EEXIST)
@@ -1903,10 +1911,7 @@ static struct page *wait_on_page_read(struct page *page)
  * @gfp:       the page allocator flags to use if allocating
  *
  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
- * any new page allocations done using the specified allocation flags. Note
- * that the Radix tree operations will still use GFP_KERNEL, so you can't
- * expect to do this atomically or anything like that - but you can pass in
- * other page requirements.
+ * any new page allocations done using the specified allocation flags.
  *
  * If the page does not get brought uptodate, return -EIO.
  */
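With the caller's gfp now passed down to add_to_page_cache_lru() as well, the stale caveat about GFP_KERNEL radix-tree insertions is gone and the gfp argument means what it says. Typical use, hedged (read_meta_page() is a hypothetical metadata read that must not recurse into the filesystem):

#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/err.h>
#include <linux/kernel.h>

/* Hypothetical: read one metadata page of @mapping at @index. */
static struct page *read_meta_page(struct address_space *mapping, pgoff_t index)
{
        struct page *page;

        /* GFP_NOFS: allocations here must not re-enter the filesystem. */
        page = read_cache_page_gfp(mapping, index, GFP_NOFS);
        if (IS_ERR(page))
                pr_debug("metadata read failed: %ld\n", PTR_ERR(page));

        return page;    /* uptodate page with a reference held, or ERR_PTR */
}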
@@ -1949,7 +1954,7 @@ EXPORT_SYMBOL(read_cache_page);
  */
 int should_remove_suid(struct dentry *dentry)
 {
-       mode_t mode = dentry->d_inode->i_mode;
+       umode_t mode = dentry->d_inode->i_mode;
        int kill = 0;
 
        /* suid always must be killed */
@@ -2040,7 +2045,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
        size_t copied;
 
        BUG_ON(!in_atomic());
-       kaddr = kmap_atomic(page, KM_USER0);
+       kaddr = kmap_atomic(page);
        if (likely(i->nr_segs == 1)) {
                int left;
                char __user *buf = i->iov->iov_base + i->iov_offset;
@@ -2050,7 +2055,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
                                                i->iov, i->iov_offset, bytes);
        }
-       kunmap_atomic(kaddr, KM_USER0);
+       kunmap_atomic(kaddr);
 
        return copied;
 }
@@ -2093,6 +2098,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
        } else {
                const struct iovec *iov = i->iov;
                size_t base = i->iov_offset;
+               unsigned long nr_segs = i->nr_segs;
 
                /*
                 * The !iov->iov_len check ensures we skip over unlikely
@@ -2108,11 +2114,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
                        base += copy;
                        if (iov->iov_len == base) {
                                iov++;
+                               nr_segs--;
                                base = 0;
                        }
                }
                i->iov = iov;
                i->iov_offset = base;
+               i->nr_segs = nr_segs;
        }
 }
 EXPORT_SYMBOL(iov_iter_advance);
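Keeping nr_segs in step matters because i->iov is advanced past fully consumed segments: without the decrement, the iterator would claim more remaining segments than the array it now points into. A worked example of the bookkeeping, assuming the 3.2-era iov_iter helpers declared in linux/fs.h (the demo function itself is hypothetical):

#include <linux/fs.h>
#include <linux/uio.h>

/* Hypothetical check: advance an iterator across a segment boundary. */
static void demo_iov_iter_advance(char __user *buf_a, char __user *buf_b)
{
        struct iovec iov[2] = {
                { .iov_base = buf_a, .iov_len = 8 },
                { .iov_base = buf_b, .iov_len = 8 },
        };
        struct iov_iter i;

        iov_iter_init(&i, iov, 2, 16, 0);       /* 2 segments, 16 bytes total */
        iov_iter_advance(&i, 10);               /* consume all of iov[0] + 2 */

        /*
         * Now: i.iov == &iov[1], i.iov_offset == 2, i.nr_segs == 1,
         * i.count == 6.  Before this fix, nr_segs would still read 2.
         */
}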
@@ -2329,8 +2337,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
 {
        int status;
+       gfp_t gfp_mask;
        struct page *page;
        gfp_t gfp_notmask = 0;
+
+       gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
 repeat:
@@ -2338,7 +2349,7 @@ repeat:
        if (page)
                goto found;
 
-       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
+       page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
        if (!page)
                return NULL;
        status = add_to_page_cache_lru(page, mapping, index,
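__GFP_WRITE marks the allocation as one that is about to be dirtied, feeding the allocator's per-zone dirty page accounting; AOP_FLAG_NOFS still strips __GFP_FS exactly as before. For context, this helper is the usual workhorse behind simple ->write_begin implementations, roughly as in the hypothetical sketch below (error handling trimmed):

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Hypothetical ->write_begin for a filesystem with no blocks to map. */
static int simplefs_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        struct page *page;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;

        *pagep = page;          /* returned locked, with a reference held */
        return 0;
}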
@@ -2382,7 +2393,6 @@ static ssize_t generic_perform_write(struct file *file,
                                                iov_iter_count(i));
 
 again:
-
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
@@ -2438,7 +2448,10 @@ again:
                written += copied;
 
                balance_dirty_pages_ratelimited(mapping);
-
+               if (fatal_signal_pending(current)) {
+                       status = -EINTR;
+                       break;
+               }
        } while (iov_iter_count(i));
 
        return written ? written : status;
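Checking fatal_signal_pending() once per copied chunk lets a killed writer leave a long buffered write promptly; the bytes already written are still reported because of the "written ? written : status" tail. The same bail-out shape in a generic, hedged form (process_chunks() and do_chunk() are hypothetical):

#include <linux/sched.h>
#include <linux/errno.h>

/*
 * Hypothetical long-running loop: do work in chunks, but abandon it
 * cleanly if the task has been fatally signalled (e.g. SIGKILL).
 */
static long process_chunks(long nr_chunks, int (*do_chunk)(long))
{
        long done = 0;
        int status = 0;

        while (done < nr_chunks) {
                status = do_chunk(done);
                if (status < 0)
                        break;
                done++;

                if (fatal_signal_pending(current)) {
                        status = -EINTR;        /* caller sees partial progress */
                        break;
                }
                cond_resched();
        }
        return done ? done : status;
}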