]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - mm/shmem.c
ashmem for 2.6.27.
[linux-2.6.git] / mm / shmem.c
index 24e95ac16053587bde608117ecced86002f5bc23..fba53caba0d47fc8bcdc9d7c639332dac2885cfe 100644 (file)
@@ -6,7 +6,8 @@
  *              2000-2001 Christoph Rohland
  *              2000-2001 SAP AG
  *              2002 Red Hat Inc.
- * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
  * Copyright (C) 2002-2005 VERITAS Software Corporation.
  * Copyright (C) 2004 Andi Kleen, SuSE Labs
  *
@@ -50,6 +51,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/shmem_fs.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/pagevec.h>
 #include <linux/percpu_counter.h>
 #include <linux/splice.h>
 #include <linux/security.h>
@@ -71,6 +73,9 @@ static struct vfsmount *shm_mnt;
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
 struct shmem_xattr {
        struct list_head list;  /* anchored by shmem_inode_info->xattr_list */
        char *name;             /* xattr name */
@@ -165,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static void shmem_free_blocks(struct inode *inode, long pages)
-{
-       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-       if (sbinfo->max_blocks) {
-               percpu_counter_add(&sbinfo->used_blocks, -pages);
-               inode->i_blocks -= pages*BLOCKS_PER_PAGE;
-       }
-}
-
 static int shmem_reserve_inode(struct super_block *sb)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -218,58 +214,293 @@ static void shmem_recalc_inode(struct inode *inode)
 
        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
        if (freed > 0) {
+               struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+               if (sbinfo->max_blocks)
+                       percpu_counter_add(&sbinfo->used_blocks, -freed);
                info->alloced -= freed;
+               inode->i_blocks -= freed * BLOCKS_PER_PAGE;
                shmem_unacct_blocks(info->flags, freed);
-               shmem_free_blocks(inode, freed);
        }
 }
 
-static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index,
-                          swp_entry_t swap)
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
+ */
+static int shmem_radix_tree_replace(struct address_space *mapping,
+                       pgoff_t index, void *expected, void *replacement)
+{
+       void **pslot;
+       void *item = NULL;
+
+       VM_BUG_ON(!expected);
+       pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+       if (pslot)
+               item = radix_tree_deref_slot_protected(pslot,
+                                                       &mapping->tree_lock);
+       if (item != expected)
+               return -ENOENT;
+       if (replacement)
+               radix_tree_replace_slot(pslot, replacement);
+       else
+               radix_tree_delete(&mapping->page_tree, index);
+       return 0;
+}
+
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+                                  struct address_space *mapping,
+                                  pgoff_t index, gfp_t gfp, void *expected)
+{
+       int error = 0;
+
+       VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(!PageSwapBacked(page));
+
+       if (!expected)
+               error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+       if (!error) {
+               page_cache_get(page);
+               page->mapping = mapping;
+               page->index = index;
+
+               spin_lock_irq(&mapping->tree_lock);
+               if (!expected)
+                       error = radix_tree_insert(&mapping->page_tree,
+                                                       index, page);
+               else
+                       error = shmem_radix_tree_replace(mapping, index,
+                                                       expected, page);
+               if (!error) {
+                       mapping->nrpages++;
+                       __inc_zone_page_state(page, NR_FILE_PAGES);
+                       __inc_zone_page_state(page, NR_SHMEM);
+                       spin_unlock_irq(&mapping->tree_lock);
+               } else {
+                       page->mapping = NULL;
+                       spin_unlock_irq(&mapping->tree_lock);
+                       page_cache_release(page);
+               }
+               if (!expected)
+                       radix_tree_preload_end();
+       }
+       if (error)
+               mem_cgroup_uncharge_cache_page(page);
+       return error;
+}
+
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 {
-       if (index < SHMEM_NR_DIRECT)
-               info->i_direct[index] = swap;
+       struct address_space *mapping = page->mapping;
+       int error;
+
+       spin_lock_irq(&mapping->tree_lock);
+       error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+       page->mapping = NULL;
+       mapping->nrpages--;
+       __dec_zone_page_state(page, NR_FILE_PAGES);
+       __dec_zone_page_state(page, NR_SHMEM);
+       spin_unlock_irq(&mapping->tree_lock);
+       page_cache_release(page);
+       BUG_ON(error);
 }
 
-static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index)
+/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
+ */
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+                                       pgoff_t start, unsigned int nr_pages,
+                                       struct page **pages, pgoff_t *indices)
+{
+       unsigned int i;
+       unsigned int ret;
+       unsigned int nr_found;
+
+       rcu_read_lock();
+restart:
+       nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+                               (void ***)pages, indices, start, nr_pages);
+       ret = 0;
+       for (i = 0; i < nr_found; i++) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot((void **)pages[i]);
+               if (unlikely(!page))
+                       continue;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto restart;
+                       /*
+                        * Otherwise, we must be storing a swap entry
+                        * here as an exceptional entry: so return it
+                        * without attempting to raise page count.
+                        */
+                       goto export;
+               }
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *((void **)pages[i]))) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+export:
+               indices[ret] = indices[i];
+               pages[ret] = page;
+               ret++;
+       }
+       if (unlikely(!ret && nr_found))
+               goto restart;
+       rcu_read_unlock();
+       return ret;
+}
+
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
+ */
+static int shmem_free_swap(struct address_space *mapping,
+                          pgoff_t index, void *radswap)
 {
-       return (index < SHMEM_NR_DIRECT) ?
-               info->i_direct[index] : (swp_entry_t){0};
+       int error;
+
+       spin_lock_irq(&mapping->tree_lock);
+       error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+       spin_unlock_irq(&mapping->tree_lock);
+       if (!error)
+               free_swap_and_cache(radix_to_swp_entry(radswap));
+       return error;
+}
+
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_pagevec_release(struct pagevec *pvec)
+{
+       int i, j;
+
+       for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+               struct page *page = pvec->pages[i];
+               if (!radix_tree_exceptional_entry(page))
+                       pvec->pages[j++] = page;
+       }
+       pvec->nr = j;
+       pagevec_release(pvec);
 }
 
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ */
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
        pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+       struct pagevec pvec;
+       pgoff_t indices[PAGEVEC_SIZE];
+       long nr_swaps_freed = 0;
        pgoff_t index;
-       swp_entry_t swap;
+       int i;
 
-       truncate_inode_pages_range(mapping, lstart, lend);
+       BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
 
-       if (end > SHMEM_NR_DIRECT)
-               end = SHMEM_NR_DIRECT;
+       pagevec_init(&pvec, 0);
+       index = start;
+       while (index <= end) {
+               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                                                       pvec.pages, indices);
+               if (!pvec.nr)
+                       break;
+               mem_cgroup_uncharge_start();
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
 
-       spin_lock(&info->lock);
-       for (index = start; index < end; index++) {
-               swap = shmem_get_swap(info, index);
-               if (swap.val) {
-                       free_swap_and_cache(swap);
-                       shmem_put_swap(info, index, (swp_entry_t){0});
-                       info->swapped--;
+                       index = indices[i];
+                       if (index > end)
+                               break;
+
+                       if (radix_tree_exceptional_entry(page)) {
+                               nr_swaps_freed += !shmem_free_swap(mapping,
+                                                               index, page);
+                               continue;
+                       }
+
+                       if (!trylock_page(page))
+                               continue;
+                       if (page->mapping == mapping) {
+                               VM_BUG_ON(PageWriteback(page));
+                               truncate_inode_page(mapping, page);
+                       }
+                       unlock_page(page);
                }
+               shmem_pagevec_release(&pvec);
+               mem_cgroup_uncharge_end();
+               cond_resched();
+               index++;
        }
 
-       if (mapping->nrpages) {
-               spin_unlock(&info->lock);
-               /*
-                * A page may have meanwhile sneaked in from swap.
-                */
-               truncate_inode_pages_range(mapping, lstart, lend);
-               spin_lock(&info->lock);
+       if (partial) {
+               struct page *page = NULL;
+               shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+               if (page) {
+                       zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+                       set_page_dirty(page);
+                       unlock_page(page);
+                       page_cache_release(page);
+               }
+       }
+
+       index = start;
+       for ( ; ; ) {
+               cond_resched();
+               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+                                                       pvec.pages, indices);
+               if (!pvec.nr) {
+                       if (index == start)
+                               break;
+                       index = start;
+                       continue;
+               }
+               if (index == start && indices[0] > end) {
+                       shmem_pagevec_release(&pvec);
+                       break;
+               }
+               mem_cgroup_uncharge_start();
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       struct page *page = pvec.pages[i];
+
+                       index = indices[i];
+                       if (index > end)
+                               break;
+
+                       if (radix_tree_exceptional_entry(page)) {
+                               nr_swaps_freed += !shmem_free_swap(mapping,
+                                                               index, page);
+                               continue;
+                       }
+
+                       lock_page(page);
+                       if (page->mapping == mapping) {
+                               VM_BUG_ON(PageWriteback(page));
+                               truncate_inode_page(mapping, page);
+                       }
+                       unlock_page(page);
+               }
+               shmem_pagevec_release(&pvec);
+               mem_cgroup_uncharge_end();
+               index++;
        }
 
+       spin_lock(&info->lock);
+       info->swapped -= nr_swaps_freed;
        shmem_recalc_inode(inode);
        spin_unlock(&info->lock);
 
@@ -289,24 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;
-               struct page *page = NULL;
 
-               if (newsize < oldsize) {
-                       /*
-                        * If truncating down to a partial page, then
-                        * if that page is already allocated, hold it
-                        * in memory until the truncation is over, so
-                        * truncate_partial_page cannot miss it were
-                        * it assigned to swap.
-                        */
-                       if (newsize & (PAGE_CACHE_SIZE-1)) {
-                               (void) shmem_getpage(inode,
-                                       newsize >> PAGE_CACHE_SHIFT,
-                                               &page, SGP_READ, NULL);
-                               if (page)
-                                       unlock_page(page);
-                       }
-               }
                if (newsize != oldsize) {
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -318,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                        /* unmap again to remove racily COWed private pages */
                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
                }
-               if (page)
-                       page_cache_release(page);
        }
 
        setattr_copy(inode, attr);
@@ -344,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
                        list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
-       }
+       } else
+               kfree(info->symlink);
 
        list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
                kfree(xattr->name);
@@ -355,23 +568,21 @@ static void shmem_evict_inode(struct inode *inode)
        end_writeback(inode);
 }
 
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
                             swp_entry_t swap, struct page *page)
 {
        struct address_space *mapping = info->vfs_inode.i_mapping;
+       void *radswap;
        pgoff_t index;
        int error;
 
-       for (index = 0; index < SHMEM_NR_DIRECT; index++)
-               if (shmem_get_swap(info, index).val == swap.val)
-                       goto found;
-       return 0;
-found:
-       spin_lock(&info->lock);
-       if (shmem_get_swap(info, index).val != swap.val) {
-               spin_unlock(&info->lock);
+       radswap = swp_to_radix_entry(swap);
+       index = radix_tree_locate_item(&mapping->page_tree, radswap);
+       if (index == -1)
                return 0;
-       }
 
        /*
         * Move _head_ to start search for next from here.
@@ -387,23 +598,30 @@ found:
         * but also to hold up shmem_evict_inode(): so inode cannot be freed
         * beneath us (pagelock doesn't help until the page is in pagecache).
         */
-       error = add_to_page_cache_locked(page, mapping, index, GFP_NOWAIT);
+       error = shmem_add_to_page_cache(page, mapping, index,
+                                               GFP_NOWAIT, radswap);
        /* which does mem_cgroup_uncharge_cache_page on error */
 
        if (error != -ENOMEM) {
+               /*
+                * Truncation and eviction use free_swap_and_cache(), which
+                * only does trylock page: if we raced, best clean up here.
+                */
                delete_from_swap_cache(page);
                set_page_dirty(page);
-               shmem_put_swap(info, index, (swp_entry_t){0});
-               info->swapped--;
-               swap_free(swap);
+               if (!error) {
+                       spin_lock(&info->lock);
+                       info->swapped--;
+                       spin_unlock(&info->lock);
+                       swap_free(swap);
+               }
                error = 1;      /* not an error, but entry was found */
        }
-       spin_unlock(&info->lock);
        return error;
 }
 
 /*
- * shmem_unuse() search for an eventually swapped out shmem page.
+ * Search through swapped inodes to find and replace swap by page.
  */
 int shmem_unuse(swp_entry_t swap, struct page *page)
 {
@@ -416,39 +634,25 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         * Charge page using GFP_KERNEL while we can wait, before taking
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
-        * add_to_page_cache() will be called with GFP_NOWAIT.
         */
        error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
        if (error)
                goto out;
-       /*
-        * Try to preload while we can wait, to not make a habit of
-        * draining atomic reserves; but don't latch on to this cpu,
-        * it's okay if sometimes we get rescheduled after this.
-        */
-       error = radix_tree_preload(GFP_KERNEL);
-       if (error)
-               goto uncharge;
-       radix_tree_preload_end();
+       /* No radix_tree_preload: swap entry keeps a place for page in tree */
 
        mutex_lock(&shmem_swaplist_mutex);
        list_for_each_safe(this, next, &shmem_swaplist) {
                info = list_entry(this, struct shmem_inode_info, swaplist);
-               if (!info->swapped) {
-                       spin_lock(&info->lock);
-                       if (!info->swapped)
-                               list_del_init(&info->swaplist);
-                       spin_unlock(&info->lock);
-               }
                if (info->swapped)
                        found = shmem_unuse_inode(info, swap, page);
+               else
+                       list_del_init(&info->swaplist);
                cond_resched();
                if (found)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);
 
-uncharge:
        if (!found)
                mem_cgroup_uncharge_cache_page(page);
        if (found < 0)
@@ -465,10 +669,10 @@ out:
 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct shmem_inode_info *info;
-       swp_entry_t swap, oswap;
        struct address_space *mapping;
-       pgoff_t index;
        struct inode *inode;
+       swp_entry_t swap;
+       pgoff_t index;
 
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
@@ -491,56 +695,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
                goto redirty;
        }
-
-       /*
-        * Just for this patch, we have a toy implementation,
-        * which can swap out only the first SHMEM_NR_DIRECT pages:
-        * for simple demonstration of where we need to think about swap.
-        */
-       if (index >= SHMEM_NR_DIRECT)
-               goto redirty;
-
        swap = get_swap_page();
        if (!swap.val)
                goto redirty;
 
        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
-        * if it's not already there.  Do it now because we cannot take
-        * mutex while holding spinlock, and must do so before the page
-        * is moved to swap cache, when its pagelock no longer protects
+        * if it's not already there.  Do it now before the page is
+        * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
-        * we've taken the spinlock, because shmem_unuse_inode() will
-        * prune a !swapped inode from the swaplist under both locks.
+        * we've incremented swapped, because shmem_unuse_inode() will
+        * prune a !swapped inode from the swaplist under this mutex.
         */
        mutex_lock(&shmem_swaplist_mutex);
        if (list_empty(&info->swaplist))
                list_add_tail(&info->swaplist, &shmem_swaplist);
 
-       spin_lock(&info->lock);
-       mutex_unlock(&shmem_swaplist_mutex);
-
-       oswap = shmem_get_swap(info, index);
-       if (oswap.val) {
-               WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
-               free_swap_and_cache(oswap);
-               shmem_put_swap(info, index, (swp_entry_t){0});
-               info->swapped--;
-       }
-       shmem_recalc_inode(inode);
-
        if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
-               delete_from_page_cache(page);
-               shmem_put_swap(info, index, swap);
-               info->swapped++;
                swap_shmem_alloc(swap);
+               shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+               spin_lock(&info->lock);
+               info->swapped++;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
+               mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(page_mapped(page));
                swap_writepage(page, wbc);
                return 0;
        }
 
-       spin_unlock(&info->lock);
+       mutex_unlock(&shmem_swaplist_mutex);
        swapcache_free(swap, NULL);
 redirty:
        set_page_dirty(page);
@@ -648,205 +834,186 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
 {
        struct address_space *mapping = inode->i_mapping;
-       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo;
        struct page *page;
-       struct page *prealloc_page = NULL;
        swp_entry_t swap;
        int error;
+       int once = 0;
 
        if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
                return -EFBIG;
 repeat:
+       swap.val = 0;
        page = find_lock_page(mapping, index);
-       if (page) {
+       if (radix_tree_exceptional_entry(page)) {
+               swap = radix_to_swp_entry(page);
+               page = NULL;
+       }
+
+       if (sgp != SGP_WRITE &&
+           ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+               error = -EINVAL;
+               goto failed;
+       }
+
+       if (page || (sgp == SGP_READ && !swap.val)) {
                /*
                 * Once we can get the page lock, it must be uptodate:
                 * if there were an error in reading back from swap,
                 * the page would not be inserted into the filecache.
                 */
-               BUG_ON(!PageUptodate(page));
-               goto done;
+               BUG_ON(page && !PageUptodate(page));
+               *pagep = page;
+               return 0;
        }
 
        /*
-        * Try to preload while we can wait, to not make a habit of
-        * draining atomic reserves; but don't latch on to this cpu.
+        * Fast cache lookup did not find it:
+        * bring it back from swap or allocate.
         */
-       error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
-       if (error)
-               goto out;
-       radix_tree_preload_end();
-
-       if (sgp != SGP_READ && !prealloc_page) {
-               prealloc_page = shmem_alloc_page(gfp, info, index);
-               if (prealloc_page) {
-                       SetPageSwapBacked(prealloc_page);
-                       if (mem_cgroup_cache_charge(prealloc_page,
-                                       current->mm, GFP_KERNEL)) {
-                               page_cache_release(prealloc_page);
-                               prealloc_page = NULL;
-                       }
-               }
-       }
+       info = SHMEM_I(inode);
+       sbinfo = SHMEM_SB(inode->i_sb);
 
-       spin_lock(&info->lock);
-       shmem_recalc_inode(inode);
-       swap = shmem_get_swap(info, index);
        if (swap.val) {
                /* Look it up and read it in.. */
                page = lookup_swap_cache(swap);
                if (!page) {
-                       spin_unlock(&info->lock);
                        /* here we actually do the io */
                        if (fault_type)
                                *fault_type |= VM_FAULT_MAJOR;
                        page = shmem_swapin(swap, gfp, info, index);
                        if (!page) {
-                               swp_entry_t nswap = shmem_get_swap(info, index);
-                               if (nswap.val == swap.val) {
-                                       error = -ENOMEM;
-                                       goto out;
-                               }
-                               goto repeat;
+                               error = -ENOMEM;
+                               goto failed;
                        }
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
                }
 
                /* We have to do this with page locked to prevent races */
-               if (!trylock_page(page)) {
-                       spin_unlock(&info->lock);
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
-               }
-               if (PageWriteback(page)) {
-                       spin_unlock(&info->lock);
-                       wait_on_page_writeback(page);
-                       unlock_page(page);
-                       page_cache_release(page);
-                       goto repeat;
-               }
+               lock_page(page);
                if (!PageUptodate(page)) {
-                       spin_unlock(&info->lock);
-                       unlock_page(page);
-                       page_cache_release(page);
                        error = -EIO;
-                       goto out;
+                       goto failed;
                }
-
-               error = add_to_page_cache_locked(page, mapping,
-                                                index, GFP_NOWAIT);
-               if (error) {
-                       spin_unlock(&info->lock);
-                       if (error == -ENOMEM) {
-                               /*
-                                * reclaim from proper memory cgroup and
-                                * call memcg's OOM if needed.
-                                */
-                               error = mem_cgroup_shmem_charge_fallback(
-                                               page, current->mm, gfp);
-                               if (error) {
-                                       unlock_page(page);
-                                       page_cache_release(page);
-                                       goto out;
-                               }
-                       }
-                       unlock_page(page);
-                       page_cache_release(page);
-                       goto repeat;
+               wait_on_page_writeback(page);
+
+               /* Someone may have already done it for us */
+               if (page->mapping) {
+                       if (page->mapping == mapping &&
+                           page->index == index)
+                               goto done;
+                       error = -EEXIST;
+                       goto failed;
                }
 
-               delete_from_swap_cache(page);
-               shmem_put_swap(info, index, (swp_entry_t){0});
+               error = mem_cgroup_cache_charge(page, current->mm,
+                                               gfp & GFP_RECLAIM_MASK);
+               if (!error)
+                       error = shmem_add_to_page_cache(page, mapping, index,
+                                               gfp, swp_to_radix_entry(swap));
+               if (error)
+                       goto failed;
+
+               spin_lock(&info->lock);
                info->swapped--;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
+               delete_from_swap_cache(page);
                set_page_dirty(page);
                swap_free(swap);
 
-       } else if (sgp == SGP_READ) {
-               page = find_get_page(mapping, index);
-               if (page && !trylock_page(page)) {
-                       spin_unlock(&info->lock);
-                       wait_on_page_locked(page);
-                       page_cache_release(page);
-                       goto repeat;
+       } else {
+               if (shmem_acct_block(info->flags)) {
+                       error = -ENOSPC;
+                       goto failed;
                }
-               spin_unlock(&info->lock);
-
-       } else if (prealloc_page) {
-               sbinfo = SHMEM_SB(inode->i_sb);
                if (sbinfo->max_blocks) {
                        if (percpu_counter_compare(&sbinfo->used_blocks,
-                                               sbinfo->max_blocks) >= 0 ||
-                           shmem_acct_block(info->flags))
-                               goto nospace;
+                                               sbinfo->max_blocks) >= 0) {
+                               error = -ENOSPC;
+                               goto unacct;
+                       }
                        percpu_counter_inc(&sbinfo->used_blocks);
-                       inode->i_blocks += BLOCKS_PER_PAGE;
-               } else if (shmem_acct_block(info->flags))
-                       goto nospace;
-
-               page = prealloc_page;
-               prealloc_page = NULL;
+               }
 
-               swap = shmem_get_swap(info, index);
-               if (swap.val)
-                       mem_cgroup_uncharge_cache_page(page);
-               else
-                       error = add_to_page_cache_lru(page, mapping,
-                                               index, GFP_NOWAIT);
-               /*
-                * At add_to_page_cache_lru() failure,
-                * uncharge will be done automatically.
-                */
-               if (swap.val || error) {
-                       shmem_unacct_blocks(info->flags, 1);
-                       shmem_free_blocks(inode, 1);
-                       spin_unlock(&info->lock);
-                       page_cache_release(page);
-                       goto repeat;
+               page = shmem_alloc_page(gfp, info, index);
+               if (!page) {
+                       error = -ENOMEM;
+                       goto decused;
                }
 
+               SetPageSwapBacked(page);
+               __set_page_locked(page);
+               error = mem_cgroup_cache_charge(page, current->mm,
+                                               gfp & GFP_RECLAIM_MASK);
+               if (!error)
+                       error = shmem_add_to_page_cache(page, mapping, index,
+                                               gfp, NULL);
+               if (error)
+                       goto decused;
+               lru_cache_add_anon(page);
+
+               spin_lock(&info->lock);
                info->alloced++;
+               inode->i_blocks += BLOCKS_PER_PAGE;
+               shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
+
                clear_highpage(page);
                flush_dcache_page(page);
                SetPageUptodate(page);
                if (sgp == SGP_DIRTY)
                        set_page_dirty(page);
-
-       } else {
-               spin_unlock(&info->lock);
-               error = -ENOMEM;
-               goto out;
        }
 done:
-       *pagep = page;
-       error = 0;
-out:
-       if (prealloc_page) {
-               mem_cgroup_uncharge_cache_page(prealloc_page);
-               page_cache_release(prealloc_page);
+       /* Perhaps the file has been truncated since we checked */
+       if (sgp != SGP_WRITE &&
+           ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+               error = -EINVAL;
+               goto trunc;
        }
-       return error;
+       *pagep = page;
+       return 0;
 
-nospace:
        /*
-        * Perhaps the page was brought in from swap between find_lock_page
-        * and taking info->lock?  We allow for that at add_to_page_cache_lru,
-        * but must also avoid reporting a spurious ENOSPC while working on a
-        * full tmpfs.
+        * Error recovery.
         */
-       page = find_get_page(mapping, index);
+trunc:
+       ClearPageDirty(page);
+       delete_from_page_cache(page);
+       spin_lock(&info->lock);
+       info->alloced--;
+       inode->i_blocks -= BLOCKS_PER_PAGE;
        spin_unlock(&info->lock);
+decused:
+       if (sbinfo->max_blocks)
+               percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+       shmem_unacct_blocks(info->flags, 1);
+failed:
+       if (swap.val && error != -EINVAL) {
+               struct page *test = find_get_page(mapping, index);
+               if (test && !radix_tree_exceptional_entry(test))
+                       page_cache_release(test);
+               /* Have another try if the entry has changed */
+               if (test != swp_to_radix_entry(swap))
+                       error = -EEXIST;
+       }
        if (page) {
+               unlock_page(page);
                page_cache_release(page);
+       }
+       if (error == -ENOSPC && !once++) {
+               info = SHMEM_I(inode);
+               spin_lock(&info->lock);
+               shmem_recalc_inode(inode);
+               spin_unlock(&info->lock);
                goto repeat;
        }
-       error = -ENOSPC;
-       goto out;
+       if (error == -EEXIST)
+               goto repeat;
+       return error;
 }
 
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -855,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        int error;
        int ret = VM_FAULT_LOCKED;
 
-       if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               return VM_FAULT_SIGBUS;
-
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
        if (error)
                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -981,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
-static const struct inode_operations shmem_symlink_inline_operations;
+static const struct inode_operations shmem_short_symlink_operations;
 
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1446,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 
        info = SHMEM_I(inode);
        inode->i_size = len-1;
-       if (len <= SHMEM_SYMLINK_INLINE_LEN) {
-               /* do it inline */
-               memcpy(info->inline_symlink, symname, len);
-               inode->i_op = &shmem_symlink_inline_operations;
+       if (len <= SHORT_SYMLINK_LEN) {
+               info->symlink = kmemdup(symname, len, GFP_KERNEL);
+               if (!info->symlink) {
+                       iput(inode);
+                       return -ENOMEM;
+               }
+               inode->i_op = &shmem_short_symlink_operations;
        } else {
                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
                if (error) {
@@ -1472,9 +1639,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
        return 0;
 }
 
-static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
 {
-       nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
+       nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
        return NULL;
 }
 
@@ -1722,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
 }
 #endif /* CONFIG_TMPFS_XATTR */
 
-static const struct inode_operations shmem_symlink_inline_operations = {
+static const struct inode_operations shmem_short_symlink_operations = {
        .readlink       = generic_readlink,
-       .follow_link    = shmem_follow_link_inline,
+       .follow_link    = shmem_follow_short_symlink,
 #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
        .getxattr       = shmem_getxattr,
@@ -1924,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
        if (config.max_inodes < inodes)
                goto out;
        /*
-        * Those tests also disallow limited->unlimited while any are in
-        * use, so i_blocks will always be zero when max_blocks is zero;
+        * Those tests disallow limited->unlimited while any are in use;
         * but we must separately disallow unlimited->limited, because
         * in that case we have no record of how much is already in use.
         */
@@ -2068,10 +2234,8 @@ static void shmem_destroy_callback(struct rcu_head *head)
 
 static void shmem_destroy_inode(struct inode *inode)
 {
-       if ((inode->i_mode & S_IFMT) == S_IFREG) {
-               /* only struct inode is valid if it's an inline symlink */
+       if ((inode->i_mode & S_IFMT) == S_IFREG)
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
-       }
        call_rcu(&inode->i_rcu, shmem_destroy_callback);
 }
 
@@ -2237,42 +2401,6 @@ out4:
        return error;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file
- * @inode: the inode to be searched
- * @index: the page offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @swapp: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index,
-                                struct page **pagep, swp_entry_t *swapp)
-{
-       struct shmem_inode_info *info = SHMEM_I(inode);
-       struct page *page = NULL;
-       swp_entry_t swap = {0};
-
-       if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               goto out;
-
-       spin_lock(&info->lock);
-#ifdef CONFIG_SWAP
-       swap = shmem_get_swap(info, index);
-       if (swap.val)
-               page = find_get_page(&swapper_space, swap.val);
-       else
-#endif
-               page = find_get_page(inode->i_mapping, index);
-       spin_unlock(&info->lock);
-out:
-       *pagep = page;
-       *swapp = swap;
-}
-#endif
-
 #else /* !CONFIG_SHMEM */
 
 /*
@@ -2318,31 +2446,6 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find page or swap assigned to the shmem file
- * @inode: the inode to be searched
- * @index: the page offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @swapp: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t index,
-                                struct page **pagep, swp_entry_t *swapp)
-{
-       struct page *page = NULL;
-
-       if ((index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-               goto out;
-       page = find_get_page(inode->i_mapping, index);
-out:
-       *pagep = page;
-       *swapp = (swp_entry_t){0};
-}
-#endif
-
 #define shmem_vm_ops                           generic_file_vm_ops
 #define shmem_file_operations                  ramfs_file_operations
 #define shmem_get_inode(sb, dir, mode, dev, flags)     ramfs_get_inode(sb, dir, mode, dev)
@@ -2417,6 +2520,15 @@ put_memory:
 }
 EXPORT_SYMBOL_GPL(shmem_file_setup);
 
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+       if (vma->vm_file)
+               fput(vma->vm_file);
+       vma->vm_file = file;
+       vma->vm_ops = &shmem_vm_ops;
+       vma->vm_flags |= VM_CAN_NONLINEAR;
+}
+
 /**
  * shmem_zero_setup - setup a shared anonymous mapping
  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -2430,11 +2542,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
        if (IS_ERR(file))
                return PTR_ERR(file);
 
-       if (vma->vm_file)
-               fput(vma->vm_file);
-       vma->vm_file = file;
-       vma->vm_ops = &shmem_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
+       shmem_set_file(vma, file);
        return 0;
 }