Merge branch 'next-evm' of git://git.kernel.org/pub/scm/linux/kernel/git/zohar/ima...
James Morris [Tue, 9 Aug 2011 00:31:03 +0000 (10:31 +1000)]
Conflicts:
fs/attr.c

Resolve conflict manually.

Signed-off-by: James Morris <jmorris@namei.org>

1  2 
Documentation/kernel-parameters.txt
fs/attr.c
fs/btrfs/xattr.c
fs/gfs2/inode.c
fs/jfs/xattr.c
fs/xfs/linux-2.6/xfs_iops.c
include/linux/security.h
mm/shmem.c
security/integrity/ima/ima_main.c
security/security.c

diff --combined Documentation/kernel-parameters.txt
@@@ -48,6 -48,7 +48,7 @@@ parameter is applicable
        EDD     BIOS Enhanced Disk Drive Services (EDD) is enabled
        EFI     EFI Partitioning (GPT) is enabled
        EIDE    EIDE/ATAPI support is enabled.
+       EVM     Extended Verification Module
        FB      The frame buffer device is enabled.
        GCOV    GCOV profiling is enabled.
        HW      Appropriate hardware is enabled.
@@@ -163,11 -164,6 +164,11 @@@ bytes respectively. Such letter suffixe
  
                        See also Documentation/power/pm.txt, pci=noacpi
  
 +      acpi_rsdp=      [ACPI,EFI,KEXEC]
 +                      Pass the RSDP address to the kernel, mostly used
 +                      on machines running EFI runtime service to boot the
 +                      second kernel for kdump.
 +
        acpi_apic_instance=     [ACPI, IOAPIC]
                        Format: <int>
                        2: use 2nd APIC table, if available
                        /proc/<pid>/coredump_filter.
                        See also Documentation/filesystems/proc.txt.
  
 +      cpuidle.off=1   [CPU_IDLE]
 +                      disable the cpuidle sub-system
 +
        cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
                        Format:
                        <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
                        This option is obsoleted by the "netdev=" option, which
                        has equivalent usage. See its documentation for details.
  
+       evm=            [EVM]
+                       Format: { "fix" }
+                       Permit 'security.evm' to be updated regardless of
+                       current integrity status.
        failslab=
        fail_page_alloc=
        fail_make_request=[KNL]
                        for all guests.
                        Default is 1 (enabled) if in 64bit or 32bit-PAE mode
  
 -      kvm-intel.bypass_guest_pf=
 -                      [KVM,Intel] Disables bypassing of guest page faults
 -                      on Intel chips. Default is 1 (enabled)
 -
        kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                        (virtualized MMU) support on capable Intel chips.
                        Default is 1 (enabled)
        no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
                        fault handling.
  
 +      no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
 +                      steal time is computed, but won't influence scheduler
 +                      behaviour
 +
        nolapic         [X86-32,APIC] Do not enable or use the local APIC.
  
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
                        See Documentation/sound/oss/oss-parameters.txt
  
        panic=          [KNL] Kernel behaviour on panic: delay <timeout>
 -                      seconds before rebooting
 +                      timeout > 0: seconds before rebooting
 +                      timeout = 0: wait forever
 +                      timeout < 0: reboot immediately
                        Format: <timeout>
  
        parkbd.port=    [HW] Parallel port number the keyboard adapter is
                                the default.
                                off: Turn ECRC off
                                on: Turn ECRC on.
 +              realloc         reallocate PCI resources if allocations done by BIOS
 +                              are erroneous.
  
        pcie_aspm=      [PCIE] Forcibly enable or disable PCIe Active State Power
                        Management.
                        [HW,MOUSE] Controls Logitech smartscroll autorepeat.
                        0 = disabled, 1 = enabled (default).
  
 +      pstore.backend= Specify the name of the pstore backend to use
 +
        pt.             [PARIDE]
                        See Documentation/blockdev/paride.txt.
  
        ro              [KNL] Mount root device read-only on boot
  
        root=           [KNL] Root filesystem
 +                      See name_to_dev_t comment in init/do_mounts.c.
  
        rootdelay=      [KNL] Delay (in seconds) to pause before attempting to
                        mount the root filesystem
                        <port#>,<js1>,<js2>,<js3>,<js4>,<js5>,<js6>,<js7>
                        See also Documentation/input/joystick-parport.txt
  
 +      udbg-immortal   [PPC] When debugging early kernel crashes that
 +                      happen after console_init() and before a proper 
 +                      console driver takes over, this boot option might
 +                      help "seeing" what's going on.
 +
        uhash_entries=  [KNL,NET]
                        Set number of hash buckets for UDP/UDP-Lite connections
  
        unknown_nmi_panic
                        [X86] Cause panic on unknown NMI.
  
 +      usbcore.authorized_default=
 +                      [USB] Default USB device authorization:
 +                      (default -1 = authorized except for wireless USB,
 +                      0 = not authorized, 1 = authorized)
 +
        usbcore.autosuspend=
                        [USB] The autosuspend time delay (in seconds) used
                        for newly-detected USB devices (default 2).  This
diff --combined fs/attr.c
+++ b/fs/attr.c
@@@ -13,6 -13,7 +13,7 @@@
  #include <linux/fsnotify.h>
  #include <linux/fcntl.h>
  #include <linux/security.h>
+ #include <linux/evm.h>
  
  /**
   * inode_change_ok - check if attribute changes to an inode are allowed
@@@ -232,13 -233,21 +233,15 @@@ int notify_change(struct dentry * dentr
        if (error)
                return error;
  
 -      if (ia_valid & ATTR_SIZE)
 -              down_write(&dentry->d_inode->i_alloc_sem);
 -
        if (inode->i_op->setattr)
                error = inode->i_op->setattr(dentry, attr);
        else
                error = simple_setattr(dentry, attr);
  
-       if (!error)
 -      if (ia_valid & ATTR_SIZE)
 -              up_write(&dentry->d_inode->i_alloc_sem);
 -
+       if (!error) {
                fsnotify_change(dentry, ia_valid);
+               evm_inode_post_setattr(dentry, ia_valid);
+       }
  
        return error;
  }
diff --combined fs/btrfs/xattr.c
@@@ -102,57 -102,43 +102,57 @@@ static int do_setxattr(struct btrfs_tra
        if (!path)
                return -ENOMEM;
  
 -      /* first lets see if we already have this xattr */
 -      di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
 -                              strlen(name), -1);
 -      if (IS_ERR(di)) {
 -              ret = PTR_ERR(di);
 -              goto out;
 -      }
 -
 -      /* ok we already have this xattr, lets remove it */
 -      if (di) {
 -              /* if we want create only exit */
 -              if (flags & XATTR_CREATE) {
 -                      ret = -EEXIST;
 +      if (flags & XATTR_REPLACE) {
 +              di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
 +                                      name_len, -1);
 +              if (IS_ERR(di)) {
 +                      ret = PTR_ERR(di);
 +                      goto out;
 +              } else if (!di) {
 +                      ret = -ENODATA;
                        goto out;
                }
 -
                ret = btrfs_delete_one_dir_name(trans, root, path, di);
 -              BUG_ON(ret);
 +              if (ret)
 +                      goto out;
                btrfs_release_path(path);
 +      }
  
 -              /* if we don't have a value then we are removing the xattr */
 -              if (!value)
 +again:
 +      ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
 +                                    name, name_len, value, size);
 +      if (ret == -EEXIST) {
 +              if (flags & XATTR_CREATE)
                        goto out;
 -      } else {
 +              /*
 +               * We can't use the path we already have since we won't have the
 +               * proper locking for a delete, so release the path and
 +               * re-lookup to delete the thing.
 +               */
                btrfs_release_path(path);
 +              di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
 +                                      name, name_len, -1);
 +              if (IS_ERR(di)) {
 +                      ret = PTR_ERR(di);
 +                      goto out;
 +              } else if (!di) {
 +                      /* Shouldn't happen but just in case... */
 +                      btrfs_release_path(path);
 +                      goto again;
 +              }
  
 -              if (flags & XATTR_REPLACE) {
 -                      /* we couldn't find the attr to replace */
 -                      ret = -ENODATA;
 +              ret = btrfs_delete_one_dir_name(trans, root, path, di);
 +              if (ret)
                        goto out;
 +
 +              /*
 +               * We have a value to set, so go back and try to insert it now.
 +               */
 +              if (value) {
 +                      btrfs_release_path(path);
 +                      goto again;
                }
        }
 -
 -      /* ok we have to create a completely new xattr */
 -      ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
 -                                    name, name_len, value, size);
 -      BUG_ON(ret);
  out:
        btrfs_free_path(path);
        return ret;
@@@ -374,36 -360,36 +374,36 @@@ int btrfs_removexattr(struct dentry *de
                                XATTR_REPLACE);
  }
  
- int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-                             struct inode *inode, struct inode *dir,
-                             const struct qstr *qstr)
+ int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                    void *fs_info)
  {
-       int err;
-       size_t len;
-       void *value;
-       char *suffix;
+       const struct xattr *xattr;
+       struct btrfs_trans_handle *trans = fs_info;
        char *name;
+       int err = 0;
  
-       err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-                                          &len);
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       return 0;
-               return err;
-       }
-       name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
-                      GFP_NOFS);
-       if (!name) {
-               err = -ENOMEM;
-       } else {
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+                              strlen(xattr->name) + 1, GFP_NOFS);
+               if (!name) {
+                       err = -ENOMEM;
+                       break;
+               }
                strcpy(name, XATTR_SECURITY_PREFIX);
-               strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-               err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+               strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+               err = __btrfs_setxattr(trans, inode, name,
+                                      xattr->value, xattr->value_len, 0);
                kfree(name);
+               if (err < 0)
+                       break;
        }
-       kfree(suffix);
-       kfree(value);
        return err;
  }
+ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+                             struct inode *inode, struct inode *dir,
+                             const struct qstr *qstr)
+ {
+       return security_inode_init_security(inode, dir, qstr,
+                                           &btrfs_initxattrs, trans);
+ }
diff --combined fs/gfs2/inode.c
@@@ -307,7 -307,7 +307,7 @@@ struct inode *gfs2_lookupi(struct inod
        }
  
        if (!is_root) {
 -              error = gfs2_permission(dir, MAY_EXEC, 0);
 +              error = gfs2_permission(dir, MAY_EXEC);
                if (error)
                        goto out;
        }
@@@ -337,7 -337,7 +337,7 @@@ static int create_ok(struct gfs2_inode 
  {
        int error;
  
 -      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
  
@@@ -624,31 -624,29 +624,29 @@@ fail
        return error;
  }
  
- static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
-                             const struct qstr *qstr)
+ int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                   void *fs_info)
  {
-       int err;
-       size_t len;
-       void *value;
-       char *name;
-       err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
-                                          &name, &value, &len);
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       return 0;
-               return err;
+       const struct xattr *xattr;
+       int err = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+                                      xattr->value_len, 0,
+                                      GFS2_EATYPE_SECURITY);
+               if (err < 0)
+                       break;
        }
        return err;
  }
  
+ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+                             const struct qstr *qstr)
+ {
+       return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+                                           &gfs2_initxattrs, NULL);
+ }
  /**
   * gfs2_create_inode - Create a new inode
   * @dir: The parent directory
@@@ -792,8 -790,13 +790,8 @@@ static int gfs2_create(struct inode *di
  static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
                                  struct nameidata *nd)
  {
 -      struct inode *inode = NULL;
 -
 -      inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 -      if (inode && IS_ERR(inode))
 -              return ERR_CAST(inode);
 -
 -      if (inode) {
 +      struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 +      if (inode && !IS_ERR(inode)) {
                struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
                struct gfs2_holder gh;
                int error;
                        return ERR_PTR(error);
                }
                gfs2_glock_dq_uninit(&gh);
 -              return d_splice_alias(inode, dentry);
        }
 -      d_add(dentry, inode);
 -
 -      return NULL;
 +      return d_splice_alias(inode, dentry);
  }
  
  /**
@@@ -849,7 -855,7 +847,7 @@@ static int gfs2_link(struct dentry *old
        if (inode->i_nlink == 0)
                goto out_gunlock;
  
 -      error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_gunlock;
  
@@@ -982,7 -988,7 +980,7 @@@ static int gfs2_unlink_ok(struct gfs2_i
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
  
 -      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
  
@@@ -1328,7 -1334,7 +1326,7 @@@ static int gfs2_rename(struct inode *od
                        }
                }
        } else {
 -              error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
 +              error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
                if (error)
                        goto out_gunlock;
  
        /* Check out the dir to be renamed */
  
        if (dir_rename) {
 -              error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
 +              error = gfs2_permission(odentry->d_inode, MAY_WRITE);
                if (error)
                        goto out_gunlock;
        }
@@@ -1535,7 -1541,7 +1533,7 @@@ static void gfs2_put_link(struct dentr
   * Returns: errno
   */
  
 -int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 +int gfs2_permission(struct inode *inode, int mask)
  {
        struct gfs2_inode *ip;
        struct gfs2_holder i_gh;
  
        ip = GFS2_I(inode);
        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
 -              if (flags & IPERM_FLAG_RCU)
 +              if (mask & MAY_NOT_BLOCK)
                        return -ECHILD;
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
                error = -EACCES;
        else
 -              error = generic_permission(inode, mask, flags, gfs2_check_acl);
 +              error = generic_permission(inode, mask);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
  
@@@ -1846,7 -1852,6 +1844,7 @@@ const struct inode_operations gfs2_file
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
  const struct inode_operations gfs2_dir_iops = {
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
  const struct inode_operations gfs2_symlink_iops = {
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
diff --combined fs/jfs/xattr.c
@@@ -693,7 -693,8 +693,7 @@@ static int can_set_system_xattr(struct 
                        return rc;
                }
                if (acl) {
 -                      mode_t mode = inode->i_mode;
 -                      rc = posix_acl_equiv_mode(acl, &mode);
 +                      rc = posix_acl_equiv_mode(acl, &inode->i_mode);
                        posix_acl_release(acl);
                        if (rc < 0) {
                                printk(KERN_ERR
                                       rc);
                                return rc;
                        }
 -                      inode->i_mode = mode;
                        mark_inode_dirty(inode);
                }
                /*
@@@ -1089,38 -1091,37 +1089,37 @@@ int jfs_removexattr(struct dentry *dent
  }
  
  #ifdef CONFIG_JFS_SECURITY
- int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
-                     const struct qstr *qstr)
+ int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                  void *fs_info)
  {
-       int rc;
-       size_t len;
-       void *value;
-       char *suffix;
+       const struct xattr *xattr;
+       tid_t *tid = fs_info;
        char *name;
-       rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-                                         &len);
-       if (rc) {
-               if (rc == -EOPNOTSUPP)
-                       return 0;
-               return rc;
-       }
-       name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
-                      GFP_NOFS);
-       if (!name) {
-               rc = -ENOMEM;
-               goto kmalloc_failed;
+       int err = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+                              strlen(xattr->name) + 1, GFP_NOFS);
+               if (!name) {
+                       err = -ENOMEM;
+                       break;
+               }
+               strcpy(name, XATTR_SECURITY_PREFIX);
+               strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+               err = __jfs_setxattr(*tid, inode, name,
+                                    xattr->value, xattr->value_len, 0);
+               kfree(name);
+               if (err < 0)
+                       break;
        }
-       strcpy(name, XATTR_SECURITY_PREFIX);
-       strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-       rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-       kfree(name);
- kmalloc_failed:
-       kfree(suffix);
-       kfree(value);
+       return err;
+ }
  
-       return rc;
+ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+                     const struct qstr *qstr)
+ {
+       return security_inode_init_security(inode, dir, qstr,
+                                           &jfs_initxattrs, &tid);
  }
  #endif
diff --combined fs/xfs/linux-2.6/xfs_iops.c
@@@ -39,7 -39,6 +39,7 @@@
  #include "xfs_buf_item.h"
  #include "xfs_utils.h"
  #include "xfs_vnodeops.h"
 +#include "xfs_inode_item.h"
  #include "xfs_trace.h"
  
  #include <linux/capability.h>
@@@ -94,37 -93,38 +94,38 @@@ xfs_mark_inode_dirty
                mark_inode_dirty(inode);
  }
  
+ int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                  void *fs_info)
+ {
+       const struct xattr *xattr;
+       struct xfs_inode *ip = XFS_I(inode);
+       int error = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               error = xfs_attr_set(ip, xattr->name, xattr->value,
+                                    xattr->value_len, ATTR_SECURE);
+               if (error < 0)
+                       break;
+       }
+       return error;
+ }
  /*
   * Hook in SELinux.  This is not quite correct yet, what we really need
   * here (as we do for default ACLs) is a mechanism by which creation of
   * these attrs can be journalled at inode creation time (along with the
   * inode, of course, such that log replay can't cause these to be lost).
   */
  STATIC int
  xfs_init_security(
        struct inode    *inode,
        struct inode    *dir,
        const struct qstr *qstr)
  {
-       struct xfs_inode *ip = XFS_I(inode);
-       size_t          length;
-       void            *value;
-       unsigned char   *name;
-       int             error;
-       error = security_inode_init_security(inode, dir, qstr, (char **)&name,
-                                            &value, &length);
-       if (error) {
-               if (error == -EOPNOTSUPP)
-                       return 0;
-               return -error;
-       }
-       error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-       kfree(name);
-       kfree(value);
-       return error;
+       return security_inode_init_security(inode, dir, qstr,
+                                           &xfs_initxattrs, NULL);
  }
  
  static void
@@@ -202,9 -202,9 +203,9 @@@ xfs_vn_mknod
  
        if (default_acl) {
                error = -xfs_inherit_acl(inode, default_acl);
 +              default_acl = NULL;
                if (unlikely(error))
                        goto out_cleanup_inode;
 -              posix_acl_release(default_acl);
        }
  
  
@@@ -498,442 -498,12 +499,442 @@@ xfs_vn_getattr
        return 0;
  }
  
 +int
 +xfs_setattr_nonsize(
 +      struct xfs_inode        *ip,
 +      struct iattr            *iattr,
 +      int                     flags)
 +{
 +      xfs_mount_t             *mp = ip->i_mount;
 +      struct inode            *inode = VFS_I(ip);
 +      int                     mask = iattr->ia_valid;
 +      xfs_trans_t             *tp;
 +      int                     error;
 +      uid_t                   uid = 0, iuid = 0;
 +      gid_t                   gid = 0, igid = 0;
 +      struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
 +      struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
 +
 +      trace_xfs_setattr(ip);
 +
 +      if (mp->m_flags & XFS_MOUNT_RDONLY)
 +              return XFS_ERROR(EROFS);
 +
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return XFS_ERROR(EIO);
 +
 +      error = -inode_change_ok(inode, iattr);
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      ASSERT((mask & ATTR_SIZE) == 0);
 +
 +      /*
 +       * If disk quotas is on, we make sure that the dquots do exist on disk,
 +       * before we start any other transactions. Trying to do this later
 +       * is messy. We don't care to take a readlock to look at the ids
 +       * in inode here, because we can't hold it across the trans_reserve.
 +       * If the IDs do change before we take the ilock, we're covered
 +       * because the i_*dquot fields will get updated anyway.
 +       */
 +      if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
 +              uint    qflags = 0;
 +
 +              if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
 +                      uid = iattr->ia_uid;
 +                      qflags |= XFS_QMOPT_UQUOTA;
 +              } else {
 +                      uid = ip->i_d.di_uid;
 +              }
 +              if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 +                      gid = iattr->ia_gid;
 +                      qflags |= XFS_QMOPT_GQUOTA;
 +              }  else {
 +                      gid = ip->i_d.di_gid;
 +              }
 +
 +              /*
 +               * We take a reference when we initialize udqp and gdqp,
 +               * so it is important that we never blindly double trip on
 +               * the same variable. See xfs_create() for an example.
 +               */
 +              ASSERT(udqp == NULL);
 +              ASSERT(gdqp == NULL);
 +              error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 +                                       qflags, &udqp, &gdqp);
 +              if (error)
 +                      return error;
 +      }
 +
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 +      error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 +      if (error)
 +              goto out_dqrele;
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +
 +      /*
 +       * Change file ownership.  Must be the owner or privileged.
 +       */
 +      if (mask & (ATTR_UID|ATTR_GID)) {
 +              /*
 +               * These IDs could have changed since we last looked at them.
 +               * But, we're assured that if the ownership did change
 +               * while we didn't have the inode locked, inode's dquot(s)
 +               * would have changed also.
 +               */
 +              iuid = ip->i_d.di_uid;
 +              igid = ip->i_d.di_gid;
 +              gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 +              uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 +
 +              /*
 +               * Do a quota reservation only if uid/gid is actually
 +               * going to change.
 +               */
 +              if (XFS_IS_QUOTA_RUNNING(mp) &&
 +                  ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 +                   (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
 +                      ASSERT(tp);
 +                      error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 +                                              capable(CAP_FOWNER) ?
 +                                              XFS_QMOPT_FORCE_RES : 0);
 +                      if (error)      /* out of quota */
 +                              goto out_trans_cancel;
 +              }
 +      }
 +
 +      xfs_trans_ijoin(tp, ip);
 +
 +      /*
 +       * Change file ownership.  Must be the owner or privileged.
 +       */
 +      if (mask & (ATTR_UID|ATTR_GID)) {
 +              /*
 +               * CAP_FSETID overrides the following restrictions:
 +               *
 +               * The set-user-ID and set-group-ID bits of a file will be
 +               * cleared upon successful return from chown()
 +               */
 +              if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 +                  !capable(CAP_FSETID))
 +                      ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 +
 +              /*
 +               * Change the ownerships and register quota modifications
 +               * in the transaction.
 +               */
 +              if (iuid != uid) {
 +                      if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 +                              ASSERT(mask & ATTR_UID);
 +                              ASSERT(udqp);
 +                              olddquot1 = xfs_qm_vop_chown(tp, ip,
 +                                                      &ip->i_udquot, udqp);
 +                      }
 +                      ip->i_d.di_uid = uid;
 +                      inode->i_uid = uid;
 +              }
 +              if (igid != gid) {
 +                      if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 +                              ASSERT(!XFS_IS_PQUOTA_ON(mp));
 +                              ASSERT(mask & ATTR_GID);
 +                              ASSERT(gdqp);
 +                              olddquot2 = xfs_qm_vop_chown(tp, ip,
 +                                                      &ip->i_gdquot, gdqp);
 +                      }
 +                      ip->i_d.di_gid = gid;
 +                      inode->i_gid = gid;
 +              }
 +      }
 +
 +      /*
 +       * Change file access modes.
 +       */
 +      if (mask & ATTR_MODE) {
 +              umode_t mode = iattr->ia_mode;
 +
 +              if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
 +                      mode &= ~S_ISGID;
 +
 +              ip->i_d.di_mode &= S_IFMT;
 +              ip->i_d.di_mode |= mode & ~S_IFMT;
 +
 +              inode->i_mode &= S_IFMT;
 +              inode->i_mode |= mode & ~S_IFMT;
 +      }
 +
 +      /*
 +       * Change file access or modified times.
 +       */
 +      if (mask & ATTR_ATIME) {
 +              inode->i_atime = iattr->ia_atime;
 +              ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
 +              ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_CTIME) {
 +              inode->i_ctime = iattr->ia_ctime;
 +              ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 +              ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_MTIME) {
 +              inode->i_mtime = iattr->ia_mtime;
 +              ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
 +              ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +
 +      XFS_STATS_INC(xs_ig_attrchg);
 +
 +      if (mp->m_flags & XFS_MOUNT_WSYNC)
 +              xfs_trans_set_sync(tp);
 +      error = xfs_trans_commit(tp, 0);
 +
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +
 +      /*
 +       * Release any dquot(s) the inode had kept before chown.
 +       */
 +      xfs_qm_dqrele(olddquot1);
 +      xfs_qm_dqrele(olddquot2);
 +      xfs_qm_dqrele(udqp);
 +      xfs_qm_dqrele(gdqp);
 +
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      /*
 +       * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
 +       *           update.  We could avoid this with linked transactions
 +       *           and passing down the transaction pointer all the way
 +       *           to attr_set.  No previous user of the generic
 +       *           Posix ACL code seems to care about this issue either.
 +       */
 +      if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
 +              error = -xfs_acl_chmod(inode);
 +              if (error)
 +                      return XFS_ERROR(error);
 +      }
 +
 +      return 0;
 +
 +out_trans_cancel:
 +      xfs_trans_cancel(tp, 0);
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +out_dqrele:
 +      xfs_qm_dqrele(udqp);
 +      xfs_qm_dqrele(gdqp);
 +      return error;
 +}
 +
 +/*
 + * Truncate file.  Must have write permission and not be a directory.
 + *
 + * Changes the size of the regular file @ip to iattr->ia_size (growing or
 + * shrinking) and applies any ctime/mtime update carried in @iattr.
 + *
 + * @flags: unless XFS_ATTR_NOLOCK is set, the iolock is taken exclusively
 + * here in addition to the ilock.
 + *
 + * Returns 0 on success.  Error values used inside this function are
 + * positive errnos (EROFS, EIO, ...); the negative returns of
 + * inode_change_ok() and block_truncate_page() are negated to match.
 + *
 + * Note that on the zero-length short-circuit path ATTR_SIZE is cleared in
 + * the caller's iattr->ia_valid before the request is handed to
 + * xfs_setattr_nonsize().
 + */
 +int
 +xfs_setattr_size(
 +      struct xfs_inode        *ip,
 +      struct iattr            *iattr,
 +      int                     flags)
 +{
 +      struct xfs_mount        *mp = ip->i_mount;
 +      struct inode            *inode = VFS_I(ip);
 +      int                     mask = iattr->ia_valid;
 +      struct xfs_trans        *tp;
 +      int                     error;
 +      uint                    lock_flags;
 +      uint                    commit_flags = 0;
 +
 +      trace_xfs_setattr(ip);
 +
 +      if (mp->m_flags & XFS_MOUNT_RDONLY)
 +              return XFS_ERROR(EROFS);
 +
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return XFS_ERROR(EIO);
 +
 +      error = -inode_change_ok(inode, iattr);
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      ASSERT(S_ISREG(ip->i_d.di_mode));
 +      ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 +                      ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
 +                      ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 +
 +      lock_flags = XFS_ILOCK_EXCL;
 +      if (!(flags & XFS_ATTR_NOLOCK))
 +              lock_flags |= XFS_IOLOCK_EXCL;
 +      xfs_ilock(ip, lock_flags);
 +
 +      /*
 +       * Short circuit the truncate case for zero length files.
 +       */
 +      if (iattr->ia_size == 0 &&
 +          ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 +              if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
 +                      goto out_unlock;
 +
 +              /*
 +               * Use the regular setattr path to update the timestamps.
 +               * All locks taken above are dropped first, and the call is
 +               * made with flags 0 rather than the caller's flags.
 +               */
 +              xfs_iunlock(ip, lock_flags);
 +              iattr->ia_valid &= ~ATTR_SIZE;
 +              return xfs_setattr_nonsize(ip, iattr, 0);
 +      }
 +
 +      /*
 +       * Make sure that the dquots are attached to the inode.
 +       */
 +      error = xfs_qm_dqattach_locked(ip, 0);
 +      if (error)
 +              goto out_unlock;
 +
 +      /*
 +       * Now we can make the changes.  Before we join the inode to the
 +       * transaction, take care of the part of the truncation that must be
 +       * done without the inode lock.  This needs to be done before joining
 +       * the inode to the transaction, because the inode cannot be unlocked
 +       * once it is a part of the transaction.
 +       */
 +      if (iattr->ia_size > ip->i_size) {
 +              /*
 +               * Do the first part of growing a file: zero any data in the
 +               * last block that is beyond the old EOF.  We need to do this
 +               * before the inode is joined to the transaction to modify
 +               * i_size.
 +               */
 +              error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
 +              if (error)
 +                      goto out_unlock;
 +      }
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +      lock_flags &= ~XFS_ILOCK_EXCL;
 +
 +      /*
 +       * We are going to log the inode size change in this transaction so
 +       * any previous writes that are beyond the on disk EOF and the new
 +       * EOF that have not been written out need to be written here.  If we
 +       * do not write the data out, we expose ourselves to the null files
 +       * problem.
 +       *
 +       * Only flush from the on disk size to the smaller of the in memory
 +       * file size or the new size as that's the range we really care about
 +       * here and prevents waiting for other data not within the range we
 +       * care about here.
 +       */
 +      if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
 +              error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
 +                                      XBF_ASYNC, FI_NONE);
 +              if (error)
 +                      goto out_unlock;
 +      }
 +
 +      /*
 +       * Wait for all I/O to complete.
 +       */
 +      xfs_ioend_wait(ip);
 +
 +      error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
 +                                   xfs_get_blocks);
 +      if (error)
 +              goto out_unlock;
 +
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 +      error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 +                               XFS_TRANS_PERM_LOG_RES,
 +                               XFS_ITRUNCATE_LOG_COUNT);
 +      if (error)
 +              goto out_trans_cancel;
 +
 +      truncate_setsize(inode, iattr->ia_size);
 +
 +      commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 +      lock_flags |= XFS_ILOCK_EXCL;
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +
 +      xfs_trans_ijoin(tp, ip);
 +
 +      /*
 +       * Only change the c/mtime if we are changing the size or we are
 +       * explicitly asked to change it.  This handles the semantic difference
 +       * between truncate() and ftruncate() as implemented in the VFS.
 +       *
 +       * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
 +       * special case where we need to update the times despite not having
 +       * these flags set.  For all other operations the VFS set these flags
 +       * explicitly if it wants a timestamp update.
 +       */
 +      if (iattr->ia_size != ip->i_size &&
 +          (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
 +              iattr->ia_ctime = iattr->ia_mtime =
 +                      current_fs_time(inode->i_sb);
 +              mask |= ATTR_CTIME | ATTR_MTIME;
 +      }
 +
 +      if (iattr->ia_size > ip->i_size) {
 +              ip->i_d.di_size = iattr->ia_size;
 +              ip->i_size = iattr->ia_size;
 +      } else if (iattr->ia_size <= ip->i_size ||
 +                 (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
 +              error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
 +              if (error)
 +                      goto out_trans_abort;
 +
 +              /*
 +               * Truncated "down", so we're removing references to old data
 +               * here - if we delay flushing for a long time, we expose
 +               * ourselves unduly to the notorious NULL files problem.  So,
 +               * we mark this inode and flush it when the file is closed,
 +               * and do not wait the usual (long) time for writeout.
 +               *
 +               * NOTE(review): this branch is reached whenever the file is
 +               * not growing; the first half of the else-if condition is the
 +               * exact negation of the "grow" test above, so the condition
 +               * as a whole is always true here.
 +               */
 +              xfs_iflags_set(ip, XFS_ITRUNCATED);
 +      }
 +
 +      if (mask & ATTR_CTIME) {
 +              inode->i_ctime = iattr->ia_ctime;
 +              ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 +              ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_MTIME) {
 +              inode->i_mtime = iattr->ia_mtime;
 +              ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
 +              ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +
 +      XFS_STATS_INC(xs_ig_attrchg);
 +
 +      if (mp->m_flags & XFS_MOUNT_WSYNC)
 +              xfs_trans_set_sync(tp);
 +
 +      error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 +out_unlock:
 +      if (lock_flags)
 +              xfs_iunlock(ip, lock_flags);
 +      return error;
 +
 +out_trans_abort:
 +      commit_flags |= XFS_TRANS_ABORT;
 +out_trans_cancel:
 +      xfs_trans_cancel(tp, commit_flags);
 +      goto out_unlock;
 +}
 +
  STATIC int
  xfs_vn_setattr(
         struct dentry   *dentry,
         struct iattr    *iattr)
  {
 -      return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 +      if (iattr->ia_valid & ATTR_SIZE)        /* size changes take the truncate path */
 +              return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
 +      return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);  /* every other attribute change */
  }
  
  #define XFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@@ -1022,7 -592,7 +1023,7 @@@ xfs_vn_fiemap
  }
  
  static const struct inode_operations xfs_inode_operations = {
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1048,7 -618,7 +1049,7 @@@ static const struct inode_operations xf
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
        .rename                 = xfs_vn_rename,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1073,7 -643,7 +1074,7 @@@ static const struct inode_operations xf
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
        .rename                 = xfs_vn_rename,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1086,7 -656,7 +1087,7 @@@ static const struct inode_operations xf
        .readlink               = generic_readlink,
        .follow_link            = xfs_vn_follow_link,
        .put_link               = xfs_vn_put_link,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1194,15 -764,6 +1195,15 @@@ xfs_setup_inode
                break;
        }
  
 +      /*
 +       * If there is no attribute fork no ACL can exist on this inode,
 +       * and it can't have any file capabilities attached to it either.
 +       */
 +      if (!XFS_IFORK_Q(ip)) {
 +              inode_has_no_xattr(inode);
 +              cache_no_acl(inode);
 +      }
 +
        xfs_iflags_clear(ip, XFS_INEW);
        barrier();
  
diff --combined include/linux/security.h
@@@ -36,6 -36,7 +36,7 @@@
  #include <linux/key.h>
  #include <linux/xfrm.h>
  #include <linux/slab.h>
+ #include <linux/xattr.h>
  #include <net/flow.h>
  
  /* Maximum number of letters for an LSM name string */
@@@ -147,6 -148,10 +148,10 @@@ extern int mmap_min_addr_handler(struc
                                 void __user *buffer, size_t *lenp, loff_t *ppos);
  #endif
  
+ /*
+  * security_inode_init_security callback function to write xattrs.
+  *
+  * It is passed the inode, an array of xattrs and an opaque fs_data
+  * pointer (security_inode_init_security() takes a matching fs_data
+  * argument next to the callback).  Returns an int; presumably 0 or a
+  * negative errno -- confirm against the implementation.
+  */
+ typedef int (*initxattrs) (struct inode *inode,
+                          const struct xattr *xattr_array, void *fs_data);
  #ifdef CONFIG_SECURITY
  
  struct security_mnt_opts {
@@@ -1456,7 -1461,7 +1461,7 @@@ struct security_operations 
                             struct inode *new_dir, struct dentry *new_dentry);
        int (*inode_readlink) (struct dentry *dentry);
        int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
 -      int (*inode_permission) (struct inode *inode, int mask, unsigned flags);
 +      int (*inode_permission) (struct inode *inode, int mask);
        int (*inode_setattr)    (struct dentry *dentry, struct iattr *attr);
        int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry);
        int (*inode_setxattr) (struct dentry *dentry, const char *name,
@@@ -1704,8 -1709,11 +1709,11 @@@ int security_sb_parse_opts_str(char *op
  int security_inode_alloc(struct inode *inode);
  void security_inode_free(struct inode *inode);
  int security_inode_init_security(struct inode *inode, struct inode *dir,
-                                const struct qstr *qstr, char **name,
-                                void **value, size_t *len);
+                                const struct qstr *qstr,
+                                initxattrs initxattrs, void *fs_data);
+ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
+                                    const struct qstr *qstr, char **name,
+                                    void **value, size_t *len);
  int security_inode_create(struct inode *dir, struct dentry *dentry, int mode);
  int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry);
@@@ -1720,6 -1728,7 +1728,6 @@@ int security_inode_rename(struct inode 
  int security_inode_readlink(struct dentry *dentry);
  int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
  int security_inode_permission(struct inode *inode, int mask);
 -int security_inode_exec_permission(struct inode *inode, unsigned int flags);
  int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
  int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry);
  int security_inode_setxattr(struct dentry *dentry, const char *name,
@@@ -2034,9 -2043,8 +2042,8 @@@ static inline void security_inode_free(
  static inline int security_inode_init_security(struct inode *inode,
                                                struct inode *dir,
                                                const struct qstr *qstr,
-                                               char **name,
-                                               void **value,
-                                               size_t *len)
+                                               initxattrs initxattrs,
+                                               void *fs_data)
  {
        return -EOPNOTSUPP;
  }
@@@ -2112,6 -2120,12 +2119,6 @@@ static inline int security_inode_permis
        return 0;
  }
  
 -static inline int security_inode_exec_permission(struct inode *inode,
 -                                                unsigned int flags)
 -{
 -      return 0;
 -}
 -
  static inline int security_inode_setattr(struct dentry *dentry,
                                          struct iattr *attr)
  {
diff --combined mm/shmem.c
@@@ -6,8 -6,7 +6,8 @@@
   *             2000-2001 Christoph Rohland
   *             2000-2001 SAP AG
   *             2002 Red Hat Inc.
 - * Copyright (C) 2002-2005 Hugh Dickins.
 + * Copyright (C) 2002-2011 Hugh Dickins.
 + * Copyright (C) 2011 Google Inc.
   * Copyright (C) 2002-2005 VERITAS Software Corporation.
   * Copyright (C) 2004 Andi Kleen, SuSE Labs
   *
@@@ -29,6 -28,7 +29,6 @@@
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/module.h>
 -#include <linux/percpu_counter.h>
  #include <linux/swap.h>
  
  static struct vfsmount *shm_mnt;
@@@ -51,9 -51,6 +51,9 @@@
  #include <linux/shmem_fs.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
 +#include <linux/pagevec.h>
 +#include <linux/percpu_counter.h>
 +#include <linux/splice.h>
  #include <linux/security.h>
  #include <linux/swapops.h>
  #include <linux/mempolicy.h>
  #include <linux/magic.h>
  
  #include <asm/uaccess.h>
 -#include <asm/div64.h>
  #include <asm/pgtable.h>
  
  #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
  #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
  
  /* Pretend that each entry is of this size in directory's i_size */
  #define BOGO_DIRENT_SIZE 20
  
 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
 +#define SHORT_SYMLINK_LEN 128
 +
  struct shmem_xattr {
        struct list_head list;  /* anchored by shmem_inode_info->xattr_list */
        char *name;             /* xattr name */
        char value[0];
  };
  
 -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 +/* Flag allocation requirements to shmem_getpage */
  enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
@@@ -103,14 -126,57 +103,14 @@@ static unsigned long shmem_default_max_
  }
  #endif
  
 -static int shmem_getpage(struct inode *inode, unsigned long idx,
 -                       struct page **pagep, enum sgp_type sgp, int *type);
 -
 -static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
 -{
 -      /*
 -       * The above definition of ENTRIES_PER_PAGE, and the use of
 -       * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 -       * might be reconsidered if it ever diverges from PAGE_SIZE.
 -       *
 -       * Mobility flags are masked out as swap vectors cannot move
 -       */
 -      return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
 -                              PAGE_CACHE_SHIFT-PAGE_SHIFT);
 -}
 -
 -static inline void shmem_dir_free(struct page *page)
 -{
 -      __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
 -}
 -
 -static struct page **shmem_dir_map(struct page *page)
 -{
 -      return (struct page **)kmap_atomic(page, KM_USER0);
 -}
 -
 -static inline void shmem_dir_unmap(struct page **dir)
 -{
 -      kunmap_atomic(dir, KM_USER0);
 -}
 -
 -static swp_entry_t *shmem_swp_map(struct page *page)
 -{
 -      return (swp_entry_t *)kmap_atomic(page, KM_USER1);
 -}
 -
 -static inline void shmem_swp_balance_unmap(void)
 -{
 -      /*
 -       * When passing a pointer to an i_direct entry, to code which
 -       * also handles indirect entries and so will shmem_swp_unmap,
 -       * we must arrange for the preempt count to remain in balance.
 -       * What kmap_atomic of a lowmem page does depends on config
 -       * and architecture, so pretend to kmap_atomic some lowmem page.
 -       */
 -      (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
 -}
 +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
  
 -static inline void shmem_swp_unmap(swp_entry_t *entry)
 +static inline int shmem_getpage(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, int *fault_type)
  {
 -      kunmap_atomic(entry, KM_USER1);
 +      return shmem_getpage_gfp(inode, index, pagep, sgp,
 +                      mapping_gfp_mask(inode->i_mapping), fault_type);
  }
  
  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@@ -170,6 -236,17 +170,6 @@@ static struct backing_dev_info shmem_ba
  static LIST_HEAD(shmem_swaplist);
  static DEFINE_MUTEX(shmem_swaplist_mutex);
  
 -static void shmem_free_blocks(struct inode *inode, long pages)
 -{
 -      struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 -      if (sbinfo->max_blocks) {
 -              percpu_counter_add(&sbinfo->used_blocks, -pages);
 -              spin_lock(&inode->i_lock);
 -              inode->i_blocks -= pages*BLOCKS_PER_PAGE;
 -              spin_unlock(&inode->i_lock);
 -      }
 -}
 -
  static int shmem_reserve_inode(struct super_block *sb)
  {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@@ -196,7 -273,7 +196,7 @@@ static void shmem_free_inode(struct sup
  }
  
  /**
 - * shmem_recalc_inode - recalculate the size of an inode
 + * shmem_recalc_inode - recalculate the block usage of an inode
   * @inode: inode to recalc
   *
   * We have to calculate the free blocks since the mm can drop
@@@ -214,297 -291,474 +214,297 @@@ static void shmem_recalc_inode(struct i
  
        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
        if (freed > 0) {
 +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 +              if (sbinfo->max_blocks)
 +                      percpu_counter_add(&sbinfo->used_blocks, -freed);
                info->alloced -= freed;
 +              inode->i_blocks -= freed * BLOCKS_PER_PAGE;
                shmem_unacct_blocks(info->flags, freed);
        }
  }
  
 -/**
 - * shmem_swp_entry - find the swap vector position in the info structure
 - * @info:  info structure for the inode
 - * @index: index of the page to find
 - * @page:  optional page to add to the structure. Has to be preset to
 - *         all zeros
 - *
 - * If there is no space allocated yet it will return NULL when
 - * page is NULL, else it will use the page for the needed block,
 - * setting it to NULL on return to indicate that it has been used.
 - *
 - * The swap vector is organized the following way:
 - *
 - * There are SHMEM_NR_DIRECT entries directly stored in the
 - * shmem_inode_info structure. So small files do not need an addional
 - * allocation.
 - *
 - * For pages with index > SHMEM_NR_DIRECT there is the pointer
 - * i_indirect which points to a page which holds in the first half
 - * doubly indirect blocks, in the second half triple indirect blocks:
 - *
 - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 - * following layout (for SHMEM_NR_DIRECT == 16):
 - *
 - * i_indirect -> dir --> 16-19
 - *          |      +-> 20-23
 - *          |
 - *          +-->dir2 --> 24-27
 - *          |        +-> 28-31
 - *          |        +-> 32-35
 - *          |        +-> 36-39
 - *          |
 - *          +-->dir3 --> 40-43
 - *                   +-> 44-47
 - *                   +-> 48-51
 - *                   +-> 52-55
 +/*
 + * Replace item expected in radix tree by a new item, while holding tree lock.
 + *
 + * Looks up the slot for index in mapping->page_tree.  If the item found
 + * there (NULL when no slot exists) is not the expected one, -ENOENT is
 + * returned and nothing is changed.  Otherwise the slot is overwritten with
 + * replacement, or the entry is deleted when replacement is NULL, and 0 is
 + * returned.  expected itself must be non-NULL (VM_BUG_ON).
   */
 -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
 -{
 -      unsigned long offset;
 -      struct page **dir;
 -      struct page *subdir;
 +static int shmem_radix_tree_replace(struct address_space *mapping,
 +                      pgoff_t index, void *expected, void *replacement)
 +{
 +      void **pslot;
 +      void *item = NULL;
 +
 +      VM_BUG_ON(!expected);
 +      pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
 +      if (pslot)
 +              item = radix_tree_deref_slot_protected(pslot,
 +                                                      &mapping->tree_lock);
 +      if (item != expected)
 +              return -ENOENT;
 +      if (replacement)
 +              radix_tree_replace_slot(pslot, replacement);
 +      else
 +              radix_tree_delete(&mapping->page_tree, index);
 +      return 0;
 +}
  
 -      if (index < SHMEM_NR_DIRECT) {
 -              shmem_swp_balance_unmap();
 -              return info->i_direct+index;
 -      }
 -      if (!info->i_indirect) {
 -              if (page) {
 -                      info->i_indirect = *page;
 -                      *page = NULL;
 -              }
 -              return NULL;                    /* need another page */
 -      }
 +/*
 + * Like add_to_page_cache_locked, but error if expected item has gone.
 + *
 + * The page must be locked and swap-backed (VM_BUG_ON).  With expected
 + * NULL the page is inserted at index after a radix_tree_preload(); with
 + * expected non-NULL it replaces that item through
 + * shmem_radix_tree_replace() and no preload is done.
 + *
 + * On success a page reference has been taken and mapping->nrpages plus
 + * the NR_FILE_PAGES and NR_SHMEM zone counters have been incremented.
 + * On any failure (including a failed preload) page->mapping is left NULL
 + * or untouched, the extra reference (if taken) is dropped again, and
 + * mem_cgroup_uncharge_cache_page() is called on the page.
 + *
 + * Returns 0 on success or the error from preload/insert/replace.
 + */
 +static int shmem_add_to_page_cache(struct page *page,
 +                                 struct address_space *mapping,
 +                                 pgoff_t index, gfp_t gfp, void *expected)
 +{
 +      int error = 0;
  
 -      index -= SHMEM_NR_DIRECT;
 -      offset = index % ENTRIES_PER_PAGE;
 -      index /= ENTRIES_PER_PAGE;
 -      dir = shmem_dir_map(info->i_indirect);
 -
 -      if (index >= ENTRIES_PER_PAGE/2) {
 -              index -= ENTRIES_PER_PAGE/2;
 -              dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
 -              index %= ENTRIES_PER_PAGE;
 -              subdir = *dir;
 -              if (!subdir) {
 -                      if (page) {
 -                              *dir = *page;
 -                              *page = NULL;
 -                      }
 -                      shmem_dir_unmap(dir);
 -                      return NULL;            /* need another page */
 -              }
 -              shmem_dir_unmap(dir);
 -              dir = shmem_dir_map(subdir);
 -      }
 +      VM_BUG_ON(!PageLocked(page));
 +      VM_BUG_ON(!PageSwapBacked(page));
  
 -      dir += index;
 -      subdir = *dir;
 -      if (!subdir) {
 -              if (!page || !(subdir = *page)) {
 -                      shmem_dir_unmap(dir);
 -                      return NULL;            /* need a page */
 +      if (!expected)
 +              error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
 +      if (!error) {
 +              page_cache_get(page);
 +              page->mapping = mapping;
 +              page->index = index;
 +
 +              spin_lock_irq(&mapping->tree_lock);
 +              if (!expected)
 +                      error = radix_tree_insert(&mapping->page_tree,
 +                                                      index, page);
 +              else
 +                      error = shmem_radix_tree_replace(mapping, index,
 +                                                      expected, page);
 +              if (!error) {
 +                      mapping->nrpages++;
 +                      __inc_zone_page_state(page, NR_FILE_PAGES);
 +                      __inc_zone_page_state(page, NR_SHMEM);
 +                      spin_unlock_irq(&mapping->tree_lock);
 +              } else {
 +                      page->mapping = NULL;
 +                      spin_unlock_irq(&mapping->tree_lock);
 +                      page_cache_release(page);
                 }
 -              *dir = subdir;
 -              *page = NULL;
 +              if (!expected)
 +                      radix_tree_preload_end();
         }
 -      shmem_dir_unmap(dir);
 -      return shmem_swp_map(subdir) + offset;
 +      if (error)
 +              mem_cgroup_uncharge_cache_page(page);
 +      return error;
   }
  
 -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
 +/*
 + * Like delete_from_page_cache, but substitutes swap for page.
 + *
 + * Under mapping->tree_lock the page's radix tree entry at page->index is
 + * replaced by radswap through shmem_radix_tree_replace(), page->mapping
 + * is cleared, and mapping->nrpages plus the NR_FILE_PAGES and NR_SHMEM
 + * zone counters are decremented.  A page reference is dropped after the
 + * lock is released.  A failed replace is treated as fatal (BUG_ON), but
 + * only after the bookkeeping above has already been done.
 + */
 +static void shmem_delete_from_page_cache(struct page *page, void *radswap)
  {
 -      long incdec = value? 1: -1;
 +      struct address_space *mapping = page->mapping;
 +      int error;
  
 -      entry->val = value;
 -      info->swapped += incdec;
 -      if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
 -              struct page *page = kmap_atomic_to_page(entry);
 -              set_page_private(page, page_private(page) + incdec);
 -      }
 +      spin_lock_irq(&mapping->tree_lock);
 +      error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
 +      page->mapping = NULL;
 +      mapping->nrpages--;
 +      __dec_zone_page_state(page, NR_FILE_PAGES);
 +      __dec_zone_page_state(page, NR_SHMEM);
 +      spin_unlock_irq(&mapping->tree_lock);
 +      page_cache_release(page);
 +      BUG_ON(error);
  }
  
 -/**
 - * shmem_swp_alloc - get the position of the swap entry for the page.
 - * @info:     info structure for the inode
 - * @index:    index of the page to find
 - * @sgp:      check and recheck i_size? skip allocation?
 - *
 - * If the entry does not exist, allocate it.
 +/*
 + * Like find_get_pages, but collecting swap entries as well as pages.
 + *
 + * Performs a gang lookup of up to nr_pages slots starting at index start,
 + * under rcu_read_lock().  The pages[] array is first used to hold the slot
 + * pointers and is then compacted in place, together with indices[], so
 + * that on return the first "ret" elements of each describe the entries
 + * found.
 + *
 + * Real pages are returned with a speculative reference taken (retrying
 + * the slot if the reference cannot be taken or the slot content changed
 + * meanwhile).  Exceptional entries that are not retry markers are returned
 + * as they are, without any reference.  Empty slots are skipped.
 + *
 + * The whole lookup is restarted on a retry marker, and also when slots
 + * were found but none of them yielded an entry.
 + *
 + * Returns the number of entries stored in pages[] and indices[].
   */
 -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
 -{
 -      struct inode *inode = &info->vfs_inode;
 -      struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 -      struct page *page = NULL;
 -      swp_entry_t *entry;
 -
 -      if (sgp != SGP_WRITE &&
 -          ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              return ERR_PTR(-EINVAL);
 -
 -      while (!(entry = shmem_swp_entry(info, index, &page))) {
 -              if (sgp == SGP_READ)
 -                      return shmem_swp_map(ZERO_PAGE(0));
 -              /*
 -               * Test used_blocks against 1 less max_blocks, since we have 1 data
 -               * page (and perhaps indirect index pages) yet to allocate:
 -               * a waste to allocate index if we cannot allocate data.
 -               */
 -              if (sbinfo->max_blocks) {
 -                      if (percpu_counter_compare(&sbinfo->used_blocks,
 -                                              sbinfo->max_blocks - 1) >= 0)
 -                              return ERR_PTR(-ENOSPC);
 -                      percpu_counter_inc(&sbinfo->used_blocks);
 -                      spin_lock(&inode->i_lock);
 -                      inode->i_blocks += BLOCKS_PER_PAGE;
 -                      spin_unlock(&inode->i_lock);
 +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
 +                                      pgoff_t start, unsigned int nr_pages,
 +                                      struct page **pages, pgoff_t *indices)
 +{
 +      unsigned int i;
 +      unsigned int ret;
 +      unsigned int nr_found;
 +
 +      rcu_read_lock();
 +restart:
 +      nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 +                              (void ***)pages, indices, start, nr_pages);
 +      ret = 0;
 +      for (i = 0; i < nr_found; i++) {
 +              struct page *page;
 +repeat:
 +              page = radix_tree_deref_slot((void **)pages[i]);
 +              if (unlikely(!page))
 +                      continue;
 +              if (radix_tree_exception(page)) {
 +                      if (radix_tree_deref_retry(page))
 +                              goto restart;
 +                      /*
 +                       * Otherwise, we must be storing a swap entry
 +                       * here as an exceptional entry: so return it
 +                       * without attempting to raise page count.
 +                       */
 +                      goto export;
                 }
 +              if (!page_cache_get_speculative(page))
 +                      goto repeat;
  
 -              spin_unlock(&info->lock);
 -              page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
 -              spin_lock(&info->lock);
 -
 -              if (!page) {
 -                      shmem_free_blocks(inode, 1);
 -                      return ERR_PTR(-ENOMEM);
 -              }
 -              if (sgp != SGP_WRITE &&
 -                  ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 -                      entry = ERR_PTR(-EINVAL);
 -                      break;
 +              /* Has the page moved? */
 +              if (unlikely(page != *((void **)pages[i]))) {
 +                      page_cache_release(page);
 +                      goto repeat;
                 }
 -              if (info->next_index <= index)
 -                      info->next_index = index + 1;
 -      }
 -      if (page) {
 -              /* another task gave its page, or truncated the file */
 -              shmem_free_blocks(inode, 1);
 -              shmem_dir_free(page);
 -      }
 -      if (info->next_index <= index && !IS_ERR(entry))
 -              info->next_index = index + 1;
 -      return entry;
 +export:
 +              indices[ret] = indices[i];
 +              pages[ret] = page;
 +              ret++;
 +      }
 +      if (unlikely(!ret && nr_found))
 +              goto restart;
 +      rcu_read_unlock();
 +      return ret;
  }
  
 -/**
 - * shmem_free_swp - free some swap entries in a directory
 - * @dir:        pointer to the directory
 - * @edir:       pointer after last entry of the directory
 - * @punch_lock: pointer to spinlock when needed for the holepunch case
 +/*
 + * Remove swap entry from radix tree, free the swap and its page cache.
 + *
 + * Under mapping->tree_lock the entry at index is deleted, but only if it
 + * is still radswap (shmem_radix_tree_replace() with a NULL replacement).
 + * When that succeeds, the swap entry decoded by radix_to_swp_entry() is
 + * released with free_swap_and_cache() after the lock has been dropped.
 + *
 + * Returns 0 on success, or the error from shmem_radix_tree_replace()
 + * (-ENOENT when the slot no longer holds radswap), in which case nothing
 + * is freed.
   */
 -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
 -                                              spinlock_t *punch_lock)
 -{
 -      spinlock_t *punch_unlock = NULL;
 -      swp_entry_t *ptr;
 -      int freed = 0;
 -
 -      for (ptr = dir; ptr < edir; ptr++) {
 -              if (ptr->val) {
 -                      if (unlikely(punch_lock)) {
 -                              punch_unlock = punch_lock;
 -                              punch_lock = NULL;
 -                              spin_lock(punch_unlock);
 -                              if (!ptr->val)
 -                                      continue;
 -                      }
 -                      free_swap_and_cache(*ptr);
 -                      *ptr = (swp_entry_t){0};
 -                      freed++;
 -              }
 -      }
 -      if (punch_unlock)
 -              spin_unlock(punch_unlock);
 -      return freed;
 -}
 -
 -static int shmem_map_and_free_swp(struct page *subdir, int offset,
 -              int limit, struct page ***dir, spinlock_t *punch_lock)
 -{
 -      swp_entry_t *ptr;
 -      int freed = 0;
 -
 -      ptr = shmem_swp_map(subdir);
 -      for (; offset < limit; offset += LATENCY_LIMIT) {
 -              int size = limit - offset;
 -              if (size > LATENCY_LIMIT)
 -                      size = LATENCY_LIMIT;
 -              freed += shmem_free_swp(ptr+offset, ptr+offset+size,
 -                                                      punch_lock);
 -              if (need_resched()) {
 -                      shmem_swp_unmap(ptr);
 -                      if (*dir) {
 -                              shmem_dir_unmap(*dir);
 -                              *dir = NULL;
 -                      }
 -                      cond_resched();
 -                      ptr = shmem_swp_map(subdir);
 -              }
 -      }
 -      shmem_swp_unmap(ptr);
 -      return freed;
 +static int shmem_free_swap(struct address_space *mapping,
 +                         pgoff_t index, void *radswap)
 +{
 +      int error;
 +
 +      spin_lock_irq(&mapping->tree_lock);
 +      error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
 +      spin_unlock_irq(&mapping->tree_lock);
 +      if (!error)
 +              free_swap_and_cache(radix_to_swp_entry(radswap));
 +      return error;
  }
  
 -static void shmem_free_pages(struct list_head *next)
 +/*
 + * Pagevec may contain swap entries, so shuffle up pages before releasing.
 + */
 +static void shmem_pagevec_release(struct pagevec *pvec)
  {
 -      struct page *page;
 -      int freed = 0;
 -
 -      do {
 -              page = container_of(next, struct page, lru);
 -              next = next->next;
 -              shmem_dir_free(page);
 -              freed++;
 -              if (freed >= LATENCY_LIMIT) {
 -                      cond_resched();
 -                      freed = 0;
 -              }
 -      } while (next);
 +      int i, j;
 +
 +      for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
 +              struct page *page = pvec->pages[i];
 +              if (!radix_tree_exceptional_entry(page))
 +                      pvec->pages[j++] = page;
 +      }
 +      pvec->nr = j;
 +      pagevec_release(pvec);
  }
  
 -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 +/*
 + * Remove range of pages and swap entries from radix tree, and free them.
 + */
 +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  {
 +      struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
 -      unsigned long idx;
 -      unsigned long size;
 -      unsigned long limit;
 -      unsigned long stage;
 -      unsigned long diroff;
 -      struct page **dir;
 -      struct page *topdir;
 -      struct page *middir;
 -      struct page *subdir;
 -      swp_entry_t *ptr;
 -      LIST_HEAD(pages_to_free);
 -      long nr_pages_to_free = 0;
 +      pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 +      unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 +      pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
 +      struct pagevec pvec;
 +      pgoff_t indices[PAGEVEC_SIZE];
        long nr_swaps_freed = 0;
 -      int offset;
 -      int freed;
 -      int punch_hole;
 -      spinlock_t *needs_lock;
 -      spinlock_t *punch_lock;
 -      unsigned long upper_limit;
 +      pgoff_t index;
 +      int i;
  
 -      truncate_inode_pages_range(inode->i_mapping, start, end);
 +      BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
  
 -      inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 -      idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 -      if (idx >= info->next_index)
 -              return;
 +      pagevec_init(&pvec, 0);
 +      index = start;
 +      while (index <= end) {
 +              pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
 +                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 +                                                      pvec.pages, indices);
 +              if (!pvec.nr)
 +                      break;
 +              mem_cgroup_uncharge_start();
 +              for (i = 0; i < pagevec_count(&pvec); i++) {
 +                      struct page *page = pvec.pages[i];
  
 -      spin_lock(&info->lock);
 -      info->flags |= SHMEM_TRUNCATE;
 -      if (likely(end == (loff_t) -1)) {
 -              limit = info->next_index;
 -              upper_limit = SHMEM_MAX_INDEX;
 -              info->next_index = idx;
 -              needs_lock = NULL;
 -              punch_hole = 0;
 -      } else {
 -              if (end + 1 >= inode->i_size) { /* we may free a little more */
 -                      limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
 -                                                      PAGE_CACHE_SHIFT;
 -                      upper_limit = SHMEM_MAX_INDEX;
 -              } else {
 -                      limit = (end + 1) >> PAGE_CACHE_SHIFT;
 -                      upper_limit = limit;
 -              }
 -              needs_lock = &info->lock;
 -              punch_hole = 1;
 -      }
 +                      index = indices[i];
 +                      if (index > end)
 +                              break;
  
 -      topdir = info->i_indirect;
 -      if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 -              info->i_indirect = NULL;
 -              nr_pages_to_free++;
 -              list_add(&topdir->lru, &pages_to_free);
 +                      if (radix_tree_exceptional_entry(page)) {
 +                              nr_swaps_freed += !shmem_free_swap(mapping,
 +                                                              index, page);
 +                              continue;
 +                      }
 +
 +                      if (!trylock_page(page))
 +                              continue;
 +                      if (page->mapping == mapping) {
 +                              VM_BUG_ON(PageWriteback(page));
 +                              truncate_inode_page(mapping, page);
 +                      }
 +                      unlock_page(page);
 +              }
 +              shmem_pagevec_release(&pvec);
 +              mem_cgroup_uncharge_end();
 +              cond_resched();
 +              index++;
        }
 -      spin_unlock(&info->lock);
  
 -      if (info->swapped && idx < SHMEM_NR_DIRECT) {
 -              ptr = info->i_direct;
 -              size = limit;
 -              if (size > SHMEM_NR_DIRECT)
 -                      size = SHMEM_NR_DIRECT;
 -              nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
 +      if (partial) {
 +              struct page *page = NULL;
 +              shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 +              if (page) {
 +                      zero_user_segment(page, partial, PAGE_CACHE_SIZE);
 +                      set_page_dirty(page);
 +                      unlock_page(page);
 +                      page_cache_release(page);
 +              }
        }
  
 -      /*
 -       * If there are no indirect blocks or we are punching a hole
 -       * below indirect blocks, nothing to be done.
 -       */
 -      if (!topdir || limit <= SHMEM_NR_DIRECT)
 -              goto done2;
 +      index = start;
 +      for ( ; ; ) {
 +              cond_resched();
 +              pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
 +                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 +                                                      pvec.pages, indices);
 +              if (!pvec.nr) {
 +                      if (index == start)
 +                              break;
 +                      index = start;
 +                      continue;
 +              }
 +              if (index == start && indices[0] > end) {
 +                      shmem_pagevec_release(&pvec);
 +                      break;
 +              }
 +              mem_cgroup_uncharge_start();
 +              for (i = 0; i < pagevec_count(&pvec); i++) {
 +                      struct page *page = pvec.pages[i];
  
 -      /*
 -       * The truncation case has already dropped info->lock, and we're safe
 -       * because i_size and next_index have already been lowered, preventing
 -       * access beyond.  But in the punch_hole case, we still need to take
 -       * the lock when updating the swap directory, because there might be
 -       * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
 -       * shmem_writepage.  However, whenever we find we can remove a whole
 -       * directory page (not at the misaligned start or end of the range),
 -       * we first NULLify its pointer in the level above, and then have no
 -       * need to take the lock when updating its contents: needs_lock and
 -       * punch_lock (either pointing to info->lock or NULL) manage this.
 -       */
 +                      index = indices[i];
 +                      if (index > end)
 +                              break;
  
 -      upper_limit -= SHMEM_NR_DIRECT;
 -      limit -= SHMEM_NR_DIRECT;
 -      idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
 -      offset = idx % ENTRIES_PER_PAGE;
 -      idx -= offset;
 -
 -      dir = shmem_dir_map(topdir);
 -      stage = ENTRIES_PER_PAGEPAGE/2;
 -      if (idx < ENTRIES_PER_PAGEPAGE/2) {
 -              middir = topdir;
 -              diroff = idx/ENTRIES_PER_PAGE;
 -      } else {
 -              dir += ENTRIES_PER_PAGE/2;
 -              dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
 -              while (stage <= idx)
 -                      stage += ENTRIES_PER_PAGEPAGE;
 -              middir = *dir;
 -              if (*dir) {
 -                      diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
 -                              ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
 -                      if (!diroff && !offset && upper_limit >= stage) {
 -                              if (needs_lock) {
 -                                      spin_lock(needs_lock);
 -                                      *dir = NULL;
 -                                      spin_unlock(needs_lock);
 -                                      needs_lock = NULL;
 -                              } else
 -                                      *dir = NULL;
 -                              nr_pages_to_free++;
 -                              list_add(&middir->lru, &pages_to_free);
 +                      if (radix_tree_exceptional_entry(page)) {
 +                              nr_swaps_freed += !shmem_free_swap(mapping,
 +                                                              index, page);
 +                              continue;
                        }
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(middir);
 -              } else {
 -                      diroff = 0;
 -                      offset = 0;
 -                      idx = stage;
 -              }
 -      }
  
 -      for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
 -              if (unlikely(idx == stage)) {
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(topdir) +
 -                          ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 -                      while (!*dir) {
 -                              dir++;
 -                              idx += ENTRIES_PER_PAGEPAGE;
 -                              if (idx >= limit)
 -                                      goto done1;
 +                      lock_page(page);
 +                      if (page->mapping == mapping) {
 +                              VM_BUG_ON(PageWriteback(page));
 +                              truncate_inode_page(mapping, page);
                        }
 -                      stage = idx + ENTRIES_PER_PAGEPAGE;
 -                      middir = *dir;
 -                      if (punch_hole)
 -                              needs_lock = &info->lock;
 -                      if (upper_limit >= stage) {
 -                              if (needs_lock) {
 -                                      spin_lock(needs_lock);
 -                                      *dir = NULL;
 -                                      spin_unlock(needs_lock);
 -                                      needs_lock = NULL;
 -                              } else
 -                                      *dir = NULL;
 -                              nr_pages_to_free++;
 -                              list_add(&middir->lru, &pages_to_free);
 -                      }
 -                      shmem_dir_unmap(dir);
 -                      cond_resched();
 -                      dir = shmem_dir_map(middir);
 -                      diroff = 0;
 -              }
 -              punch_lock = needs_lock;
 -              subdir = dir[diroff];
 -              if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
 -                      if (needs_lock) {
 -                              spin_lock(needs_lock);
 -                              dir[diroff] = NULL;
 -                              spin_unlock(needs_lock);
 -                              punch_lock = NULL;
 -                      } else
 -                              dir[diroff] = NULL;
 -                      nr_pages_to_free++;
 -                      list_add(&subdir->lru, &pages_to_free);
 -              }
 -              if (subdir && page_private(subdir) /* has swap entries */) {
 -                      size = limit - idx;
 -                      if (size > ENTRIES_PER_PAGE)
 -                              size = ENTRIES_PER_PAGE;
 -                      freed = shmem_map_and_free_swp(subdir,
 -                                      offset, size, &dir, punch_lock);
 -                      if (!dir)
 -                              dir = shmem_dir_map(middir);
 -                      nr_swaps_freed += freed;
 -                      if (offset || punch_lock) {
 -                              spin_lock(&info->lock);
 -                              set_page_private(subdir,
 -                                      page_private(subdir) - freed);
 -                              spin_unlock(&info->lock);
 -                      } else
 -                              BUG_ON(page_private(subdir) != freed);
 +                      unlock_page(page);
                }
 -              offset = 0;
 -      }
 -done1:
 -      shmem_dir_unmap(dir);
 -done2:
 -      if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
 -              /*
 -               * Call truncate_inode_pages again: racing shmem_unuse_inode
 -               * may have swizzled a page in from swap since
 -               * truncate_pagecache or generic_delete_inode did it, before we
 -               * lowered next_index.  Also, though shmem_getpage checks
 -               * i_size before adding to cache, no recheck after: so fix the
 -               * narrow window there too.
 -               */
 -              truncate_inode_pages_range(inode->i_mapping, start, end);
 +              shmem_pagevec_release(&pvec);
 +              mem_cgroup_uncharge_end();
 +              index++;
        }
  
        spin_lock(&info->lock);
 -      info->flags &= ~SHMEM_TRUNCATE;
        info->swapped -= nr_swaps_freed;
        shmem_recalc_inode(inode);
        spin_unlock(&info->lock);
  
 -      /*
 -       * Empty swap vector directory pages to be freed?
 -       */
 -      if (!list_empty(&pages_to_free)) {
 -              pages_to_free.prev->next = NULL;
 -              shmem_free_pages(pages_to_free.next);
 -      }
 +      inode->i_ctime = inode->i_mtime = CURRENT_TIME;
  }
  EXPORT_SYMBOL_GPL(shmem_truncate_range);
  
@@@ -520,7 -774,37 +520,7 @@@ static int shmem_setattr(struct dentry 
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;
 -              struct page *page = NULL;
  
 -              if (newsize < oldsize) {
 -                      /*
 -                       * If truncating down to a partial page, then
 -                       * if that page is already allocated, hold it
 -                       * in memory until the truncation is over, so
 -                       * truncate_partial_page cannot miss it were
 -                       * it assigned to swap.
 -                       */
 -                      if (newsize & (PAGE_CACHE_SIZE-1)) {
 -                              (void) shmem_getpage(inode,
 -                                      newsize >> PAGE_CACHE_SHIFT,
 -                                              &page, SGP_READ, NULL);
 -                              if (page)
 -                                      unlock_page(page);
 -                      }
 -                      /*
 -                       * Reset SHMEM_PAGEIN flag so that shmem_truncate can
 -                       * detect if any pages might have been added to cache
 -                       * after truncate_inode_pages.  But we needn't bother
 -                       * if it's being fully truncated to zero-length: the
 -                       * nrpages check is efficient enough in that case.
 -                       */
 -                      if (newsize) {
 -                              struct shmem_inode_info *info = SHMEM_I(inode);
 -                              spin_lock(&info->lock);
 -                              info->flags &= ~SHMEM_PAGEIN;
 -                              spin_unlock(&info->lock);
 -                      }
 -              }
                if (newsize != oldsize) {
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
                        /* unmap again to remove racily COWed private pages */
                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
                }
 -              if (page)
 -                      page_cache_release(page);
        }
  
        setattr_copy(inode, attr);
@@@ -556,8 -842,7 +556,8 @@@ static void shmem_evict_inode(struct in
                        list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
 -      }
 +      } else
 +              kfree(info->symlink);
  
        list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
                kfree(xattr->name);
        end_writeback(inode);
  }
  
 -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
 -{
 -      swp_entry_t *ptr;
 -
 -      for (ptr = dir; ptr < edir; ptr++) {
 -              if (ptr->val == entry.val)
 -                      return ptr - dir;
 -      }
 -      return -1;
 -}
 -
 -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
 +/*
 + * If swap found in inode, free it and move page from swapcache to filecache.
 + */
 +static int shmem_unuse_inode(struct shmem_inode_info *info,
 +                           swp_entry_t swap, struct page *page)
  {
 -      struct address_space *mapping;
 -      unsigned long idx;
 -      unsigned long size;
 -      unsigned long limit;
 -      unsigned long stage;
 -      struct page **dir;
 -      struct page *subdir;
 -      swp_entry_t *ptr;
 -      int offset;
 +      struct address_space *mapping = info->vfs_inode.i_mapping;
 +      void *radswap;
 +      pgoff_t index;
        int error;
  
 -      idx = 0;
 -      ptr = info->i_direct;
 -      spin_lock(&info->lock);
 -      if (!info->swapped) {
 -              list_del_init(&info->swaplist);
 -              goto lost2;
 -      }
 -      limit = info->next_index;
 -      size = limit;
 -      if (size > SHMEM_NR_DIRECT)
 -              size = SHMEM_NR_DIRECT;
 -      offset = shmem_find_swp(entry, ptr, ptr+size);
 -      if (offset >= 0) {
 -              shmem_swp_balance_unmap();
 -              goto found;
 -      }
 -      if (!info->i_indirect)
 -              goto lost2;
 -
 -      dir = shmem_dir_map(info->i_indirect);
 -      stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
 -
 -      for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
 -              if (unlikely(idx == stage)) {
 -                      shmem_dir_unmap(dir-1);
 -                      if (cond_resched_lock(&info->lock)) {
 -                              /* check it has not been truncated */
 -                              if (limit > info->next_index) {
 -                                      limit = info->next_index;
 -                                      if (idx >= limit)
 -                                              goto lost2;
 -                              }
 -                      }
 -                      dir = shmem_dir_map(info->i_indirect) +
 -                          ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 -                      while (!*dir) {
 -                              dir++;
 -                              idx += ENTRIES_PER_PAGEPAGE;
 -                              if (idx >= limit)
 -                                      goto lost1;
 -                      }
 -                      stage = idx + ENTRIES_PER_PAGEPAGE;
 -                      subdir = *dir;
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(subdir);
 -              }
 -              subdir = *dir;
 -              if (subdir && page_private(subdir)) {
 -                      ptr = shmem_swp_map(subdir);
 -                      size = limit - idx;
 -                      if (size > ENTRIES_PER_PAGE)
 -                              size = ENTRIES_PER_PAGE;
 -                      offset = shmem_find_swp(entry, ptr, ptr+size);
 -                      shmem_swp_unmap(ptr);
 -                      if (offset >= 0) {
 -                              shmem_dir_unmap(dir);
 -                              ptr = shmem_swp_map(subdir);
 -                              goto found;
 -                      }
 -              }
 -      }
 -lost1:
 -      shmem_dir_unmap(dir-1);
 -lost2:
 -      spin_unlock(&info->lock);
 -      return 0;
 -found:
 -      idx += offset;
 -      ptr += offset;
 +      radswap = swp_to_radix_entry(swap);
 +      index = radix_tree_locate_item(&mapping->page_tree, radswap);
 +      if (index == -1)
 +              return 0;
  
        /*
         * Move _head_ to start search for next from here.
         * But be careful: shmem_evict_inode checks list_empty without taking
         * mutex, and there's an instant in list_move_tail when info->swaplist
 -       * would appear empty, if it were the only one on shmem_swaplist.  We
 -       * could avoid doing it if inode NULL; or use this minor optimization.
 +       * would appear empty, if it were the only one on shmem_swaplist.
         */
        if (shmem_swaplist.next != &info->swaplist)
                list_move_tail(&shmem_swaplist, &info->swaplist);
         * but also to hold up shmem_evict_inode(): so inode cannot be freed
         * beneath us (pagelock doesn't help until the page is in pagecache).
         */
 -      mapping = info->vfs_inode.i_mapping;
 -      error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
 +      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              GFP_NOWAIT, radswap);
        /* which does mem_cgroup_uncharge_cache_page on error */
  
 -      if (error == -EEXIST) {
 -              struct page *filepage = find_get_page(mapping, idx);
 -              error = 1;
 -              if (filepage) {
 -                      /*
 -                       * There might be a more uptodate page coming down
 -                       * from a stacked writepage: forget our swappage if so.
 -                       */
 -                      if (PageUptodate(filepage))
 -                              error = 0;
 -                      page_cache_release(filepage);
 -              }
 -      }
 -      if (!error) {
 +      if (error != -ENOMEM) {
 +              /*
 +               * Truncation and eviction use free_swap_and_cache(), which
 +               * only does trylock page: if we raced, best clean up here.
 +               */
                delete_from_swap_cache(page);
                set_page_dirty(page);
 -              info->flags |= SHMEM_PAGEIN;
 -              shmem_swp_set(info, ptr, 0);
 -              swap_free(entry);
 +              if (!error) {
 +                      spin_lock(&info->lock);
 +                      info->swapped--;
 +                      spin_unlock(&info->lock);
 +                      swap_free(swap);
 +              }
                error = 1;      /* not an error, but entry was found */
        }
 -      shmem_swp_unmap(ptr);
 -      spin_unlock(&info->lock);
        return error;
  }
  
  /*
 - * shmem_unuse() search for an eventually swapped out shmem page.
 + * Search through swapped inodes to find and replace swap by page.
   */
 -int shmem_unuse(swp_entry_t entry, struct page *page)
 +int shmem_unuse(swp_entry_t swap, struct page *page)
  {
 -      struct list_head *p, *next;
 +      struct list_head *this, *next;
        struct shmem_inode_info *info;
        int found = 0;
        int error;
         * Charge page using GFP_KERNEL while we can wait, before taking
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
 -       * add_to_page_cache() will be called with GFP_NOWAIT.
         */
        error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
        if (error)
                goto out;
 -      /*
 -       * Try to preload while we can wait, to not make a habit of
 -       * draining atomic reserves; but don't latch on to this cpu,
 -       * it's okay if sometimes we get rescheduled after this.
 -       */
 -      error = radix_tree_preload(GFP_KERNEL);
 -      if (error)
 -              goto uncharge;
 -      radix_tree_preload_end();
 +      /* No radix_tree_preload: swap entry keeps a place for page in tree */
  
        mutex_lock(&shmem_swaplist_mutex);
 -      list_for_each_safe(p, next, &shmem_swaplist) {
 -              info = list_entry(p, struct shmem_inode_info, swaplist);
 -              found = shmem_unuse_inode(info, entry, page);
 +      list_for_each_safe(this, next, &shmem_swaplist) {
 +              info = list_entry(this, struct shmem_inode_info, swaplist);
 +              if (info->swapped)
 +                      found = shmem_unuse_inode(info, swap, page);
 +              else
 +                      list_del_init(&info->swaplist);
                cond_resched();
                if (found)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);
  
 -uncharge:
        if (!found)
                mem_cgroup_uncharge_cache_page(page);
        if (found < 0)
@@@ -669,10 -1048,10 +669,10 @@@ out
  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
  {
        struct shmem_inode_info *info;
 -      swp_entry_t *entry, swap;
        struct address_space *mapping;
 -      unsigned long index;
        struct inode *inode;
 +      swp_entry_t swap;
 +      pgoff_t index;
  
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
        /*
         * shmem_backing_dev_info's capabilities prevent regular writeback or
         * sync from ever calling shmem_writepage; but a stacking filesystem
 -       * may use the ->writepage of its underlying filesystem, in which case
 +       * might use ->writepage of its underlying filesystem, in which case
         * tmpfs should write out to swap only in response to memory pressure,
 -       * and not for the writeback threads or sync.  However, in those cases,
 -       * we do still want to check if there's a redundant swappage to be
 -       * discarded.
 +       * and not for the writeback threads or sync.
         */
 -      if (wbc->for_reclaim)
 -              swap = get_swap_page();
 -      else
 -              swap.val = 0;
 +      if (!wbc->for_reclaim) {
 +              WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
 +              goto redirty;
 +      }
 +      swap = get_swap_page();
 +      if (!swap.val)
 +              goto redirty;
  
        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
 -       * if it's not already there.  Do it now because we cannot take
 -       * mutex while holding spinlock, and must do so before the page
 -       * is moved to swap cache, when its pagelock no longer protects
 +       * if it's not already there.  Do it now before the page is
 +       * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
 -       * we've taken the spinlock, because shmem_unuse_inode() will
 -       * prune a !swapped inode from the swaplist under both locks.
 +       * we've incremented swapped, because shmem_unuse_inode() will
 +       * prune a !swapped inode from the swaplist under this mutex.
         */
 -      if (swap.val) {
 -              mutex_lock(&shmem_swaplist_mutex);
 -              if (list_empty(&info->swaplist))
 -                      list_add_tail(&info->swaplist, &shmem_swaplist);
 -      }
 -
 -      spin_lock(&info->lock);
 -      if (swap.val)
 -              mutex_unlock(&shmem_swaplist_mutex);
 -
 -      if (index >= info->next_index) {
 -              BUG_ON(!(info->flags & SHMEM_TRUNCATE));
 -              goto unlock;
 -      }
 -      entry = shmem_swp_entry(info, index, NULL);
 -      if (entry->val) {
 -              /*
 -               * The more uptodate page coming down from a stacked
 -               * writepage should replace our old swappage.
 -               */
 -              free_swap_and_cache(*entry);
 -              shmem_swp_set(info, entry, 0);
 -      }
 -      shmem_recalc_inode(inode);
 +      mutex_lock(&shmem_swaplist_mutex);
 +      if (list_empty(&info->swaplist))
 +              list_add_tail(&info->swaplist, &shmem_swaplist);
  
 -      if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
 -              delete_from_page_cache(page);
 -              shmem_swp_set(info, entry, swap.val);
 -              shmem_swp_unmap(entry);
 +      if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
                swap_shmem_alloc(swap);
 +              shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
 +
 +              spin_lock(&info->lock);
 +              info->swapped++;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 +
 +              mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(page_mapped(page));
                swap_writepage(page, wbc);
                return 0;
        }
  
 -      shmem_swp_unmap(entry);
 -unlock:
 -      spin_unlock(&info->lock);
 -      /*
 -       * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 -       * clear SWAP_HAS_CACHE flag.
 -       */
 +      mutex_unlock(&shmem_swaplist_mutex);
        swapcache_free(swap, NULL);
  redirty:
        set_page_dirty(page);
@@@ -763,33 -1165,35 +763,33 @@@ static struct mempolicy *shmem_get_sbmp
  }
  #endif /* CONFIG_TMPFS */
  
 -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        struct mempolicy mpol, *spol;
        struct vm_area_struct pvma;
 -      struct page *page;
  
        spol = mpol_cond_copy(&mpol,
 -                              mpol_shared_policy_lookup(&info->policy, idx));
 +                      mpol_shared_policy_lookup(&info->policy, index));
  
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
 -      pvma.vm_pgoff = idx;
 +      pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
        pvma.vm_policy = spol;
 -      page = swapin_readahead(entry, gfp, &pvma, 0);
 -      return page;
 +      return swapin_readahead(swap, gfp, &pvma, 0);
  }
  
  static struct page *shmem_alloc_page(gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        struct vm_area_struct pvma;
  
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
 -      pvma.vm_pgoff = idx;
 +      pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
 -      pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
 +      pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
  
        /*
         * alloc_page_vma() will drop the shared policy reference
  }
  #else /* !CONFIG_NUMA */
  #ifdef CONFIG_TMPFS
 -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
 +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
  {
  }
  #endif /* CONFIG_TMPFS */
  
 -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
 -      return swapin_readahead(entry, gfp, NULL, 0);
 +      return swapin_readahead(swap, gfp, NULL, 0);
  }
  
  static inline struct page *shmem_alloc_page(gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        return alloc_page(gfp);
  }
@@@ -824,195 -1228,311 +824,195 @@@ static inline struct mempolicy *shmem_g
  #endif
  
  /*
 - * shmem_getpage - either get the page from swap or allocate a new one
 + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
   *
   * If we allocate a new one we do not mark it dirty. That's up to the
   * vm. If we swap it in we mark it dirty since we also free the swap
   * entry since a page cannot live in both the swap and page cache
   */
 -static int shmem_getpage(struct inode *inode, unsigned long idx,
 -                      struct page **pagep, enum sgp_type sgp, int *type)
 +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
  {
        struct address_space *mapping = inode->i_mapping;
 -      struct shmem_inode_info *info = SHMEM_I(inode);
 +      struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo;
 -      struct page *filepage = *pagep;
 -      struct page *swappage;
 -      struct page *prealloc_page = NULL;
 -      swp_entry_t *entry;
 +      struct page *page;
        swp_entry_t swap;
 -      gfp_t gfp;
        int error;
 +      int once = 0;
  
 -      if (idx >= SHMEM_MAX_INDEX)
 +      if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
                return -EFBIG;
 +repeat:
 +      swap.val = 0;
 +      page = find_lock_page(mapping, index);
 +      if (radix_tree_exceptional_entry(page)) {
 +              swap = radix_to_swp_entry(page);
 +              page = NULL;
 +      }
  
 -      if (type)
 -              *type = 0;
 +      if (sgp != SGP_WRITE &&
 +          ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 +              error = -EINVAL;
 +              goto failed;
 +      }
  
 -      /*
 -       * Normally, filepage is NULL on entry, and either found
 -       * uptodate immediately, or allocated and zeroed, or read
 -       * in under swappage, which is then assigned to filepage.
 -       * But shmem_readpage (required for splice) passes in a locked
 -       * filepage, which may be found not uptodate by other callers
 -       * too, and may need to be copied from the swappage read in.
 -       */
 -repeat:
 -      if (!filepage)
 -              filepage = find_lock_page(mapping, idx);
 -      if (filepage && PageUptodate(filepage))
 -              goto done;
 -      gfp = mapping_gfp_mask(mapping);
 -      if (!filepage) {
 +      if (page || (sgp == SGP_READ && !swap.val)) {
                /*
 -               * Try to preload while we can wait, to not make a habit of
 -               * draining atomic reserves; but don't latch on to this cpu.
 +               * Once we can get the page lock, it must be uptodate:
 +               * if there were an error in reading back from swap,
 +               * the page would not be inserted into the filecache.
                 */
 -              error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 -              if (error)
 -                      goto failed;
 -              radix_tree_preload_end();
 -              if (sgp != SGP_READ && !prealloc_page) {
 -                      /* We don't care if this fails */
 -                      prealloc_page = shmem_alloc_page(gfp, info, idx);
 -                      if (prealloc_page) {
 -                              if (mem_cgroup_cache_charge(prealloc_page,
 -                                              current->mm, GFP_KERNEL)) {
 -                                      page_cache_release(prealloc_page);
 -                                      prealloc_page = NULL;
 -                              }
 -                      }
 -              }
 +              BUG_ON(page && !PageUptodate(page));
 +              *pagep = page;
 +              return 0;
        }
 -      error = 0;
  
 -      spin_lock(&info->lock);
 -      shmem_recalc_inode(inode);
 -      entry = shmem_swp_alloc(info, idx, sgp);
 -      if (IS_ERR(entry)) {
 -              spin_unlock(&info->lock);
 -              error = PTR_ERR(entry);
 -              goto failed;
 -      }
 -      swap = *entry;
 +      /*
 +       * Fast cache lookup did not find it:
 +       * bring it back from swap or allocate.
 +       */
 +      info = SHMEM_I(inode);
 +      sbinfo = SHMEM_SB(inode->i_sb);
  
        if (swap.val) {
                /* Look it up and read it in.. */
 -              swappage = lookup_swap_cache(swap);
 -              if (!swappage) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 +              page = lookup_swap_cache(swap);
 +              if (!page) {
                        /* here we actually do the io */
 -                      if (type)
 -                              *type |= VM_FAULT_MAJOR;
 -                      swappage = shmem_swapin(swap, gfp, info, idx);
 -                      if (!swappage) {
 -                              spin_lock(&info->lock);
 -                              entry = shmem_swp_alloc(info, idx, sgp);
 -                              if (IS_ERR(entry))
 -                                      error = PTR_ERR(entry);
 -                              else {
 -                                      if (entry->val == swap.val)
 -                                              error = -ENOMEM;
 -                                      shmem_swp_unmap(entry);
 -                              }
 -                              spin_unlock(&info->lock);
 -                              if (error)
 -                                      goto failed;
 -                              goto repeat;
 +                      if (fault_type)
 +                              *fault_type |= VM_FAULT_MAJOR;
 +                      page = shmem_swapin(swap, gfp, info, index);
 +                      if (!page) {
 +                              error = -ENOMEM;
 +                              goto failed;
                        }
                }
  
                /* We have to do this with page locked to prevent races */
 -              if (!trylock_page(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_locked(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -              if (PageWriteback(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_writeback(swappage);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -              if (!PageUptodate(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 +              lock_page(page);
 +              if (!PageUptodate(page)) {
                        error = -EIO;
                        goto failed;
                }
 -
 -              if (filepage) {
 -                      shmem_swp_set(info, entry, 0);
 -                      shmem_swp_unmap(entry);
 -                      delete_from_swap_cache(swappage);
 -                      spin_unlock(&info->lock);
 -                      copy_highpage(filepage, swappage);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      flush_dcache_page(filepage);
 -                      SetPageUptodate(filepage);
 -                      set_page_dirty(filepage);
 -                      swap_free(swap);
 -              } else if (!(error = add_to_page_cache_locked(swappage, mapping,
 -                                      idx, GFP_NOWAIT))) {
 -                      info->flags |= SHMEM_PAGEIN;
 -                      shmem_swp_set(info, entry, 0);
 -                      shmem_swp_unmap(entry);
 -                      delete_from_swap_cache(swappage);
 -                      spin_unlock(&info->lock);
 -                      filepage = swappage;
 -                      set_page_dirty(filepage);
 -                      swap_free(swap);
 -              } else {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      if (error == -ENOMEM) {
 -                              /*
 -                               * reclaim from proper memory cgroup and
 -                               * call memcg's OOM if needed.
 -                               */
 -                              error = mem_cgroup_shmem_charge_fallback(
 -                                                              swappage,
 -                                                              current->mm,
 -                                                              gfp);
 -                              if (error) {
 -                                      unlock_page(swappage);
 -                                      page_cache_release(swappage);
 -                                      goto failed;
 -                              }
 -                      }
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -      } else if (sgp == SGP_READ && !filepage) {
 -              shmem_swp_unmap(entry);
 -              filepage = find_get_page(mapping, idx);
 -              if (filepage &&
 -                  (!PageUptodate(filepage) || !trylock_page(filepage))) {
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_locked(filepage);
 -                      page_cache_release(filepage);
 -                      filepage = NULL;
 -                      goto repeat;
 +              wait_on_page_writeback(page);
 +
 +              /* Someone may have already done it for us */
 +              if (page->mapping) {
 +                      if (page->mapping == mapping &&
 +                          page->index == index)
 +                              goto done;
 +                      error = -EEXIST;
 +                      goto failed;
                }
 +
 +              error = mem_cgroup_cache_charge(page, current->mm,
 +                                              gfp & GFP_RECLAIM_MASK);
 +              if (!error)
 +                      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              gfp, swp_to_radix_entry(swap));
 +              if (error)
 +                      goto failed;
 +
 +              spin_lock(&info->lock);
 +              info->swapped--;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 +
 +              delete_from_swap_cache(page);
 +              set_page_dirty(page);
 +              swap_free(swap);
 +
        } else {
 -              shmem_swp_unmap(entry);
 -              sbinfo = SHMEM_SB(inode->i_sb);
 +              if (shmem_acct_block(info->flags)) {
 +                      error = -ENOSPC;
 +                      goto failed;
 +              }
                if (sbinfo->max_blocks) {
                        if (percpu_counter_compare(&sbinfo->used_blocks,
 -                                              sbinfo->max_blocks) >= 0 ||
 -                          shmem_acct_block(info->flags))
 -                              goto nospace;
 -                      percpu_counter_inc(&sbinfo->used_blocks);
 -                      spin_lock(&inode->i_lock);
 -                      inode->i_blocks += BLOCKS_PER_PAGE;
 -                      spin_unlock(&inode->i_lock);
 -              } else if (shmem_acct_block(info->flags))
 -                      goto nospace;
 -
 -              if (!filepage) {
 -                      int ret;
 -
 -                      if (!prealloc_page) {
 -                              spin_unlock(&info->lock);
 -                              filepage = shmem_alloc_page(gfp, info, idx);
 -                              if (!filepage) {
 -                                      shmem_unacct_blocks(info->flags, 1);
 -                                      shmem_free_blocks(inode, 1);
 -                                      error = -ENOMEM;
 -                                      goto failed;
 -                              }
 -                              SetPageSwapBacked(filepage);
 -
 -                              /*
 -                               * Precharge page while we can wait, compensate
 -                               * after
 -                               */
 -                              error = mem_cgroup_cache_charge(filepage,
 -                                      current->mm, GFP_KERNEL);
 -                              if (error) {
 -                                      page_cache_release(filepage);
 -                                      shmem_unacct_blocks(info->flags, 1);
 -                                      shmem_free_blocks(inode, 1);
 -                                      filepage = NULL;
 -                                      goto failed;
 -                              }
 -
 -                              spin_lock(&info->lock);
 -                      } else {
 -                              filepage = prealloc_page;
 -                              prealloc_page = NULL;
 -                              SetPageSwapBacked(filepage);
 +                                              sbinfo->max_blocks) >= 0) {
 +                              error = -ENOSPC;
 +                              goto unacct;
                        }
 +                      percpu_counter_inc(&sbinfo->used_blocks);
 +              }
  
 -                      entry = shmem_swp_alloc(info, idx, sgp);
 -                      if (IS_ERR(entry))
 -                              error = PTR_ERR(entry);
 -                      else {
 -                              swap = *entry;
 -                              shmem_swp_unmap(entry);
 -                      }
 -                      ret = error || swap.val;
 -                      if (ret)
 -                              mem_cgroup_uncharge_cache_page(filepage);
 -                      else
 -                              ret = add_to_page_cache_lru(filepage, mapping,
 -                                              idx, GFP_NOWAIT);
 -                      /*
 -                       * At add_to_page_cache_lru() failure, uncharge will
 -                       * be done automatically.
 -                       */
 -                      if (ret) {
 -                              spin_unlock(&info->lock);
 -                              page_cache_release(filepage);
 -                              shmem_unacct_blocks(info->flags, 1);
 -                              shmem_free_blocks(inode, 1);
 -                              filepage = NULL;
 -                              if (error)
 -                                      goto failed;
 -                              goto repeat;
 -                      }
 -                      info->flags |= SHMEM_PAGEIN;
 +              page = shmem_alloc_page(gfp, info, index);
 +              if (!page) {
 +                      error = -ENOMEM;
 +                      goto decused;
                }
  
 +              SetPageSwapBacked(page);
 +              __set_page_locked(page);
 +              error = mem_cgroup_cache_charge(page, current->mm,
 +                                              gfp & GFP_RECLAIM_MASK);
 +              if (!error)
 +                      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              gfp, NULL);
 +              if (error)
 +                      goto decused;
 +              lru_cache_add_anon(page);
 +
 +              spin_lock(&info->lock);
                info->alloced++;
 +              inode->i_blocks += BLOCKS_PER_PAGE;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 -              clear_highpage(filepage);
 -              flush_dcache_page(filepage);
 -              SetPageUptodate(filepage);
 +
 +              clear_highpage(page);
 +              flush_dcache_page(page);
 +              SetPageUptodate(page);
                if (sgp == SGP_DIRTY)
 -                      set_page_dirty(filepage);
 +                      set_page_dirty(page);
        }
  done:
 -      *pagep = filepage;
 -      error = 0;
 -      goto out;
 +      /* Perhaps the file has been truncated since we checked */
 +      if (sgp != SGP_WRITE &&
 +          ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 +              error = -EINVAL;
 +              goto trunc;
 +      }
 +      *pagep = page;
 +      return 0;
  
 -nospace:
        /*
 -       * Perhaps the page was brought in from swap between find_lock_page
 -       * and taking info->lock?  We allow for that at add_to_page_cache_lru,
 -       * but must also avoid reporting a spurious ENOSPC while working on a
 -       * full tmpfs.  (When filepage has been passed in to shmem_getpage, it
 -       * is already in page cache, which prevents this race from occurring.)
 +       * Error recovery.
         */
 -      if (!filepage) {
 -              struct page *page = find_get_page(mapping, idx);
 -              if (page) {
 -                      spin_unlock(&info->lock);
 -                      page_cache_release(page);
 -                      goto repeat;
 -              }
 -      }
 +trunc:
 +      ClearPageDirty(page);
 +      delete_from_page_cache(page);
 +      spin_lock(&info->lock);
 +      info->alloced--;
 +      inode->i_blocks -= BLOCKS_PER_PAGE;
        spin_unlock(&info->lock);
 -      error = -ENOSPC;
 +decused:
 +      if (sbinfo->max_blocks)
 +              percpu_counter_add(&sbinfo->used_blocks, -1);
 +unacct:
 +      shmem_unacct_blocks(info->flags, 1);
  failed:
 -      if (*pagep != filepage) {
 -              unlock_page(filepage);
 -              page_cache_release(filepage);
 +      if (swap.val && error != -EINVAL) {
 +              struct page *test = find_get_page(mapping, index);
 +              if (test && !radix_tree_exceptional_entry(test))
 +                      page_cache_release(test);
 +              /* Have another try if the entry has changed */
 +              if (test != swp_to_radix_entry(swap))
 +                      error = -EEXIST;
        }
 -out:
 -      if (prealloc_page) {
 -              mem_cgroup_uncharge_cache_page(prealloc_page);
 -              page_cache_release(prealloc_page);
 +      if (page) {
 +              unlock_page(page);
 +              page_cache_release(page);
 +      }
 +      if (error == -ENOSPC && !once++) {
 +              info = SHMEM_I(inode);
 +              spin_lock(&info->lock);
 +              shmem_recalc_inode(inode);
 +              spin_unlock(&info->lock);
 +              goto repeat;
        }
 +      if (error == -EEXIST)
 +              goto repeat;
        return error;
  }
  
@@@ -1020,34 -1540,36 +1020,34 @@@ static int shmem_fault(struct vm_area_s
  {
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        int error;
 -      int ret;
 -
 -      if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              return VM_FAULT_SIGBUS;
 +      int ret = VM_FAULT_LOCKED;
  
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
        if (error)
                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 +
        if (ret & VM_FAULT_MAJOR) {
                count_vm_event(PGMAJFAULT);
                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
        }
 -      return ret | VM_FAULT_LOCKED;
 +      return ret;
  }
  
  #ifdef CONFIG_NUMA
 -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
  {
 -      struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 -      return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
 +      struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 +      return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
  }
  
  static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr)
  {
 -      struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 -      unsigned long idx;
 +      struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 +      pgoff_t index;
  
 -      idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 -      return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
 +      index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 +      return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
  }
  #endif
  
@@@ -1145,7 -1667,20 +1145,7 @@@ static struct inode *shmem_get_inode(st
  
  #ifdef CONFIG_TMPFS
  static const struct inode_operations shmem_symlink_inode_operations;
 -static const struct inode_operations shmem_symlink_inline_operations;
 -
 -/*
 - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
 - * but providing them allows a tmpfs file to be used for splice, sendfile, and
 - * below the loop driver, in the generic fashion that many filesystems support.
 - */
 -static int shmem_readpage(struct file *file, struct page *page)
 -{
 -      struct inode *inode = page->mapping->host;
 -      int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
 -      unlock_page(page);
 -      return error;
 -}
 +static const struct inode_operations shmem_short_symlink_operations;
  
  static int
  shmem_write_begin(struct file *file, struct address_space *mapping,
  {
        struct inode *inode = mapping->host;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 -      *pagep = NULL;
        return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
  }
  
@@@ -1178,8 -1714,7 +1178,8 @@@ static void do_shmem_file_read(struct f
  {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
 -      unsigned long index, offset;
 +      pgoff_t index;
 +      unsigned long offset;
        enum sgp_type sgp = SGP_READ;
  
        /*
  
        for (;;) {
                struct page *page = NULL;
 -              unsigned long end_index, nr, ret;
 +              pgoff_t end_index;
 +              unsigned long nr, ret;
                loff_t i_size = i_size_read(inode);
  
                end_index = i_size >> PAGE_CACHE_SHIFT;
@@@ -1312,119 -1846,6 +1312,119 @@@ static ssize_t shmem_file_aio_read(stru
        return retval;
  }
  
 +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 +                              struct pipe_inode_info *pipe, size_t len,
 +                              unsigned int flags)
 +{
 +      struct address_space *mapping = in->f_mapping;
 +      struct inode *inode = mapping->host;
 +      unsigned int loff, nr_pages, req_pages;
 +      struct page *pages[PIPE_DEF_BUFFERS];
 +      struct partial_page partial[PIPE_DEF_BUFFERS];
 +      struct page *page;
 +      pgoff_t index, end_index;
 +      loff_t isize, left;
 +      int error, page_nr;
 +      struct splice_pipe_desc spd = {
 +              .pages = pages,
 +              .partial = partial,
 +              .flags = flags,
 +              .ops = &page_cache_pipe_buf_ops,
 +              .spd_release = spd_release_page,
 +      };
 +
 +      isize = i_size_read(inode);
 +      if (unlikely(*ppos >= isize))
 +              return 0;
 +
 +      left = isize - *ppos;
 +      if (unlikely(left < len))
 +              len = left;
 +
 +      if (splice_grow_spd(pipe, &spd))
 +              return -ENOMEM;
 +
 +      index = *ppos >> PAGE_CACHE_SHIFT;
 +      loff = *ppos & ~PAGE_CACHE_MASK;
 +      req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 +      nr_pages = min(req_pages, pipe->buffers);
 +
 +      spd.nr_pages = find_get_pages_contig(mapping, index,
 +                                              nr_pages, spd.pages);
 +      index += spd.nr_pages;
 +      error = 0;
 +
 +      while (spd.nr_pages < nr_pages) {
 +              error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
 +              if (error)
 +                      break;
 +              unlock_page(page);
 +              spd.pages[spd.nr_pages++] = page;
 +              index++;
 +      }
 +
 +      index = *ppos >> PAGE_CACHE_SHIFT;
 +      nr_pages = spd.nr_pages;
 +      spd.nr_pages = 0;
 +
 +      for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 +              unsigned int this_len;
 +
 +              if (!len)
 +                      break;
 +
 +              this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 +              page = spd.pages[page_nr];
 +
 +              if (!PageUptodate(page) || page->mapping != mapping) {
 +                      error = shmem_getpage(inode, index, &page,
 +                                                      SGP_CACHE, NULL);
 +                      if (error)
 +                              break;
 +                      unlock_page(page);
 +                      page_cache_release(spd.pages[page_nr]);
 +                      spd.pages[page_nr] = page;
 +              }
 +
 +              isize = i_size_read(inode);
 +              end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 +              if (unlikely(!isize || index > end_index))
 +                      break;
 +
 +              if (end_index == index) {
 +                      unsigned int plen;
 +
 +                      plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 +                      if (plen <= loff)
 +                              break;
 +
 +                      this_len = min(this_len, plen - loff);
 +                      len = this_len;
 +              }
 +
 +              spd.partial[page_nr].offset = loff;
 +              spd.partial[page_nr].len = this_len;
 +              len -= this_len;
 +              loff = 0;
 +              spd.nr_pages++;
 +              index++;
 +      }
 +
 +      while (page_nr < nr_pages)
 +              page_cache_release(spd.pages[page_nr++]);
 +
 +      if (spd.nr_pages)
 +              error = splice_to_pipe(pipe, &spd);
 +
 +      splice_shrink_spd(pipe, &spd);
 +
 +      if (error > 0) {
 +              *ppos += error;
 +              file_accessed(in);
 +      }
 +      return error;
 +}
 +
  static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
        buf->f_namelen = NAME_MAX;
        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
 -              buf->f_bavail = buf->f_bfree =
 -                              sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
 +              buf->f_bavail =
 +              buf->f_bfree  = sbinfo->max_blocks -
 +                              percpu_counter_sum(&sbinfo->used_blocks);
        }
        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
@@@ -1458,7 -1878,7 +1458,7 @@@ shmem_mknod(struct inode *dir, struct d
        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (inode) {
                error = security_inode_init_security(inode, dir,
-                                                    &dentry->d_name, NULL,
+                                                    &dentry->d_name,
                                                     NULL, NULL);
                if (error) {
                        if (error != -EOPNOTSUPP) {
@@@ -1586,7 -2006,7 +1586,7 @@@ static int shmem_symlink(struct inode *
        int error;
        int len;
        struct inode *inode;
 -      struct page *page = NULL;
 +      struct page *page;
        char *kaddr;
        struct shmem_inode_info *info;
  
        if (!inode)
                return -ENOSPC;
  
-       error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+       error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             NULL, NULL);
        if (error) {
                if (error != -EOPNOTSUPP) {
  
        info = SHMEM_I(inode);
        inode->i_size = len-1;
 -      if (len <= SHMEM_SYMLINK_INLINE_LEN) {
 -              /* do it inline */
 -              memcpy(info->inline_symlink, symname, len);
 -              inode->i_op = &shmem_symlink_inline_operations;
 +      if (len <= SHORT_SYMLINK_LEN) {
 +              info->symlink = kmemdup(symname, len, GFP_KERNEL);
 +              if (!info->symlink) {
 +                      iput(inode);
 +                      return -ENOMEM;
 +              }
 +              inode->i_op = &shmem_short_symlink_operations;
        } else {
                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
                if (error) {
        return 0;
  }
  
 -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
 +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
  {
 -      nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
 +      nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
        return NULL;
  }
  
  static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
  {
        struct page *page = NULL;
 -      int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 -      nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
 +      int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 +      nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
        if (page)
                unlock_page(page);
        return page;
@@@ -1760,6 -2177,7 +1760,6 @@@ out
        return err;
  }
  
 -
  static const struct xattr_handler *shmem_xattr_handlers[] = {
  #ifdef CONFIG_TMPFS_POSIX_ACL
        &generic_acl_access_handler,
@@@ -1889,9 -2307,9 +1889,9 @@@ static ssize_t shmem_listxattr(struct d
  }
  #endif /* CONFIG_TMPFS_XATTR */
  
 -static const struct inode_operations shmem_symlink_inline_operations = {
 +static const struct inode_operations shmem_short_symlink_operations = {
        .readlink       = generic_readlink,
 -      .follow_link    = shmem_follow_link_inline,
 +      .follow_link    = shmem_follow_short_symlink,
  #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
        .getxattr       = shmem_getxattr,
@@@ -2091,7 -2509,8 +2091,7 @@@ static int shmem_remount_fs(struct supe
        if (config.max_inodes < inodes)
                goto out;
        /*
 -       * Those tests also disallow limited->unlimited while any are in
 -       * use, so i_blocks will always be zero when max_blocks is zero;
 +       * Those tests disallow limited->unlimited while any are in use;
         * but we must separately disallow unlimited->limited, because
         * in that case we have no record of how much is already in use.
         */
@@@ -2183,7 -2602,7 +2183,7 @@@ int shmem_fill_super(struct super_bloc
                goto failed;
        sbinfo->free_inodes = sbinfo->max_inodes;
  
 -      sb->s_maxbytes = SHMEM_MAX_BYTES;
 +      sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
@@@ -2218,14 -2637,14 +2218,14 @@@ static struct kmem_cache *shmem_inode_c
  
  static struct inode *shmem_alloc_inode(struct super_block *sb)
  {
 -      struct shmem_inode_info *p;
 -      p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 -      if (!p)
 +      struct shmem_inode_info *info;
 +      info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 +      if (!info)
                return NULL;
 -      return &p->vfs_inode;
 +      return &info->vfs_inode;
  }
  
 -static void shmem_i_callback(struct rcu_head *head)
 +static void shmem_destroy_callback(struct rcu_head *head)
  {
        struct inode *inode = container_of(head, struct inode, i_rcu);
        INIT_LIST_HEAD(&inode->i_dentry);
  
  static void shmem_destroy_inode(struct inode *inode)
  {
 -      if ((inode->i_mode & S_IFMT) == S_IFREG) {
 -              /* only struct inode is valid if it's an inline symlink */
 +      if ((inode->i_mode & S_IFMT) == S_IFREG)
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 -      }
 -      call_rcu(&inode->i_rcu, shmem_i_callback);
 +      call_rcu(&inode->i_rcu, shmem_destroy_callback);
  }
  
 -static void init_once(void *foo)
 +static void shmem_init_inode(void *foo)
  {
 -      struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 -
 -      inode_init_once(&p->vfs_inode);
 +      struct shmem_inode_info *info = foo;
 +      inode_init_once(&info->vfs_inode);
  }
  
 -static int init_inodecache(void)
 +static int shmem_init_inodecache(void)
  {
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
 -                              0, SLAB_PANIC, init_once);
 +                              0, SLAB_PANIC, shmem_init_inode);
        return 0;
  }
  
 -static void destroy_inodecache(void)
 +static void shmem_destroy_inodecache(void)
  {
        kmem_cache_destroy(shmem_inode_cachep);
  }
@@@ -2262,6 -2684,7 +2262,6 @@@ static const struct address_space_opera
        .writepage      = shmem_writepage,
        .set_page_dirty = __set_page_dirty_no_writeback,
  #ifdef CONFIG_TMPFS
 -      .readpage       = shmem_readpage,
        .write_begin    = shmem_write_begin,
        .write_end      = shmem_write_end,
  #endif
@@@ -2278,7 -2701,7 +2278,7 @@@ static const struct file_operations shm
        .aio_read       = shmem_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .fsync          = noop_fsync,
 -      .splice_read    = generic_file_splice_read,
 +      .splice_read    = shmem_file_splice_read,
        .splice_write   = generic_file_splice_write,
  #endif
  };
@@@ -2292,6 -2715,10 +2292,6 @@@ static const struct inode_operations sh
        .listxattr      = shmem_listxattr,
        .removexattr    = shmem_removexattr,
  #endif
 -#ifdef CONFIG_TMPFS_POSIX_ACL
 -      .check_acl      = generic_check_acl,
 -#endif
 -
  };
  
  static const struct inode_operations shmem_dir_inode_operations = {
  #endif
  #ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
 -      .check_acl      = generic_check_acl,
  #endif
  };
  
@@@ -2326,6 -2754,7 +2326,6 @@@ static const struct inode_operations sh
  #endif
  #ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
 -      .check_acl      = generic_check_acl,
  #endif
  };
  
@@@ -2350,20 -2779,21 +2350,20 @@@ static const struct vm_operations_struc
  #endif
  };
  
 -
  static struct dentry *shmem_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
  {
        return mount_nodev(fs_type, flags, data, shmem_fill_super);
  }
  
 -static struct file_system_type tmpfs_fs_type = {
 +static struct file_system_type shmem_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "tmpfs",
        .mount          = shmem_mount,
        .kill_sb        = kill_litter_super,
  };
  
 -int __init init_tmpfs(void)
 +int __init shmem_init(void)
  {
        int error;
  
        if (error)
                goto out4;
  
 -      error = init_inodecache();
 +      error = shmem_init_inodecache();
        if (error)
                goto out3;
  
 -      error = register_filesystem(&tmpfs_fs_type);
 +      error = register_filesystem(&shmem_fs_type);
        if (error) {
                printk(KERN_ERR "Could not register tmpfs\n");
                goto out2;
        }
  
 -      shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
 -                              tmpfs_fs_type.name, NULL);
 +      shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
 +                               shmem_fs_type.name, NULL);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                printk(KERN_ERR "Could not kern_mount tmpfs\n");
        return 0;
  
  out1:
 -      unregister_filesystem(&tmpfs_fs_type);
 +      unregister_filesystem(&shmem_fs_type);
  out2:
 -      destroy_inodecache();
 +      shmem_destroy_inodecache();
  out3:
        bdi_destroy(&shmem_backing_dev_info);
  out4:
        return error;
  }
  
 -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 -/**
 - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
 - * @inode: the inode to be searched
 - * @pgoff: the offset to be searched
 - * @pagep: the pointer for the found page to be stored
 - * @ent: the pointer for the found swap entry to be stored
 - *
 - * If a page is found, refcount of it is incremented. Callers should handle
 - * these refcount.
 - */
 -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 -                                      struct page **pagep, swp_entry_t *ent)
 -{
 -      swp_entry_t entry = { .val = 0 }, *ptr;
 -      struct page *page = NULL;
 -      struct shmem_inode_info *info = SHMEM_I(inode);
 -
 -      if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              goto out;
 -
 -      spin_lock(&info->lock);
 -      ptr = shmem_swp_entry(info, pgoff, NULL);
 -#ifdef CONFIG_SWAP
 -      if (ptr && ptr->val) {
 -              entry.val = ptr->val;
 -              page = find_get_page(&swapper_space, entry.val);
 -      } else
 -#endif
 -              page = find_get_page(inode->i_mapping, pgoff);
 -      if (ptr)
 -              shmem_swp_unmap(ptr);
 -      spin_unlock(&info->lock);
 -out:
 -      *pagep = page;
 -      *ent = entry;
 -}
 -#endif
 -
  #else /* !CONFIG_SHMEM */
  
  /*
  
  #include <linux/ramfs.h>
  
 -static struct file_system_type tmpfs_fs_type = {
 +static struct file_system_type shmem_fs_type = {
        .name           = "tmpfs",
        .mount          = ramfs_mount,
        .kill_sb        = kill_litter_super,
  };
  
 -int __init init_tmpfs(void)
 +int __init shmem_init(void)
  {
 -      BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
 +      BUG_ON(register_filesystem(&shmem_fs_type) != 0);
  
 -      shm_mnt = kern_mount(&tmpfs_fs_type);
 +      shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
  
        return 0;
  }
  
 -int shmem_unuse(swp_entry_t entry, struct page *page)
 +int shmem_unuse(swp_entry_t swap, struct page *page)
  {
        return 0;
  }
@@@ -2440,17 -2909,43 +2440,17 @@@ int shmem_lock(struct file *file, int l
        return 0;
  }
  
 -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  {
 -      truncate_inode_pages_range(inode->i_mapping, start, end);
 +      truncate_inode_pages_range(inode->i_mapping, lstart, lend);
  }
  EXPORT_SYMBOL_GPL(shmem_truncate_range);
  
 -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 -/**
 - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
 - * @inode: the inode to be searched
 - * @pgoff: the offset to be searched
 - * @pagep: the pointer for the found page to be stored
 - * @ent: the pointer for the found swap entry to be stored
 - *
 - * If a page is found, refcount of it is incremented. Callers should handle
 - * these refcount.
 - */
 -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 -                                      struct page **pagep, swp_entry_t *ent)
 -{
 -      struct page *page = NULL;
 -
 -      if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              goto out;
 -      page = find_get_page(inode->i_mapping, pgoff);
 -out:
 -      *pagep = page;
 -      *ent = (swp_entry_t){ .val = 0 };
 -}
 -#endif
 -
  #define shmem_vm_ops                          generic_file_vm_ops
  #define shmem_file_operations                 ramfs_file_operations
  #define shmem_get_inode(sb, dir, mode, dev, flags)    ramfs_get_inode(sb, dir, mode, dev)
  #define shmem_acct_size(flags, size)          0
  #define shmem_unacct_size(flags, size)                do {} while (0)
 -#define SHMEM_MAX_BYTES                               MAX_LFS_FILESIZE
  
  #endif /* CONFIG_SHMEM */
  
@@@ -2474,7 -2969,7 +2474,7 @@@ struct file *shmem_file_setup(const cha
        if (IS_ERR(shm_mnt))
                return (void *)shm_mnt;
  
 -      if (size < 0 || size > SHMEM_MAX_BYTES)
 +      if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);
  
        if (shmem_acct_size(flags, size))
@@@ -2553,29 -3048,13 +2553,29 @@@ int shmem_zero_setup(struct vm_area_str
   * suit tmpfs, since it may have pages in swapcache, and needs to find those
   * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
   *
 - * Provide a stub for those callers to start using now, then later
 - * flesh it out to call shmem_getpage() with additional gfp mask, when
 - * shmem_file_splice_read() is added and shmem_readpage() is removed.
 + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
   */
  struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
  {
 +#ifdef CONFIG_SHMEM
 +      struct inode *inode = mapping->host;
 +      struct page *page;
 +      int error;
 +
 +      BUG_ON(mapping->a_ops != &shmem_aops);
 +      error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
 +      if (error)
 +              page = ERR_PTR(error);
 +      else
 +              unlock_page(page);
 +      return page;
 +#else
 +      /*
 +       * The tiny !SHMEM case uses ramfs without swap
 +       */
        return read_cache_page_gfp(mapping, index, gfp);
 +#endif
  }
  EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
@@@ -82,11 -82,11 +82,11 @@@ out
                                  "open_writers");
  }
  
- static void ima_check_last_writer(struct ima_iint_cache *iint,
+ static void ima_check_last_writer(struct integrity_iint_cache *iint,
                                  struct inode *inode,
                                  struct file *file)
  {
 -      mode_t mode = file->f_mode;
 +      fmode_t mode = file->f_mode;
  
        mutex_lock(&iint->mutex);
        if (mode & FMODE_WRITE &&
  void ima_file_free(struct file *file)
  {
        struct inode *inode = file->f_dentry->d_inode;
-       struct ima_iint_cache *iint;
+       struct integrity_iint_cache *iint;
  
        if (!iint_initialized || !S_ISREG(inode->i_mode))
                return;
  
-       iint = ima_iint_find(inode);
+       iint = integrity_iint_find(inode);
        if (!iint)
                return;
  
@@@ -121,7 -121,7 +121,7 @@@ static int process_measurement(struct f
                               int mask, int function)
  {
        struct inode *inode = file->f_dentry->d_inode;
-       struct ima_iint_cache *iint;
+       struct integrity_iint_cache *iint;
        int rc = 0;
  
        if (!ima_initialized || !S_ISREG(inode->i_mode))
        if (rc != 0)
                return rc;
  retry:
-       iint = ima_iint_find(inode);
+       iint = integrity_iint_find(inode);
        if (!iint) {
-               rc = ima_inode_alloc(inode);
+               rc = integrity_inode_alloc(inode);
                if (!rc || rc == -EEXIST)
                        goto retry;
                return rc;
diff --combined security/security.c
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/security.h>
+ #include <linux/integrity.h>
  #include <linux/ima.h>
+ #include <linux/evm.h>
+ #define MAX_LSM_EVM_XATTR     2
  
  /* Boot-time LSM user choice */
  static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
@@@ -334,20 -338,57 +338,57 @@@ int security_inode_alloc(struct inode *
  
  void security_inode_free(struct inode *inode)
  {
-       ima_inode_free(inode);
+       integrity_inode_free(inode);
        security_ops->inode_free_security(inode);
  }
  
  int security_inode_init_security(struct inode *inode, struct inode *dir,
-                                const struct qstr *qstr, char **name,
-                                void **value, size_t *len)
+                                const struct qstr *qstr,
+                                const initxattrs initxattrs, void *fs_data)
+ {
+       struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1];
+       struct xattr *lsm_xattr, *evm_xattr, *xattr;
+       int ret;
+       if (unlikely(IS_PRIVATE(inode)))
+               return -EOPNOTSUPP;
+       memset(new_xattrs, 0, sizeof new_xattrs);
+       if (!initxattrs)
+               return security_ops->inode_init_security(inode, dir, qstr,
+                                                        NULL, NULL, NULL);
+       lsm_xattr = new_xattrs;
+       ret = security_ops->inode_init_security(inode, dir, qstr,
+                                               &lsm_xattr->name,
+                                               &lsm_xattr->value,
+                                               &lsm_xattr->value_len);
+       if (ret)
+               goto out;
+       evm_xattr = lsm_xattr + 1;
+       ret = evm_inode_init_security(inode, lsm_xattr, evm_xattr);
+       if (ret)
+               goto out;
+       ret = initxattrs(inode, new_xattrs, fs_data);
+ out:
+       for (xattr = new_xattrs; xattr->name != NULL; xattr++) {
+               kfree(xattr->name);
+               kfree(xattr->value);
+       }
+       return (ret == -EOPNOTSUPP) ? 0 : ret;
+ }
+ EXPORT_SYMBOL(security_inode_init_security);
+ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
+                                    const struct qstr *qstr, char **name,
+                                    void **value, size_t *len)
  {
        if (unlikely(IS_PRIVATE(inode)))
                return -EOPNOTSUPP;
        return security_ops->inode_init_security(inode, dir, qstr, name, value,
                                                 len);
  }
- EXPORT_SYMBOL(security_inode_init_security);
+ EXPORT_SYMBOL(security_old_inode_init_security);
  
  #ifdef CONFIG_SECURITY_PATH
  int security_path_mknod(struct path *dir, struct dentry *dentry, int mode,
@@@ -518,14 -559,26 +559,19 @@@ int security_inode_permission(struct in
  {
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
 -      return security_ops->inode_permission(inode, mask, 0);
 -}
 -
 -int security_inode_exec_permission(struct inode *inode, unsigned int flags)
 -{
 -      if (unlikely(IS_PRIVATE(inode)))
 -              return 0;
 -      return security_ops->inode_permission(inode, MAY_EXEC, flags);
 +      return security_ops->inode_permission(inode, mask);
  }
  
  int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
  {
+       int ret;
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_setattr(dentry, attr);
+       ret = security_ops->inode_setattr(dentry, attr);
+       if (ret)
+               return ret;
+       return evm_inode_setattr(dentry, attr);
  }
  EXPORT_SYMBOL_GPL(security_inode_setattr);
  
@@@ -539,9 -592,14 +585,14 @@@ int security_inode_getattr(struct vfsmo
  int security_inode_setxattr(struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
  {
+       int ret;
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_setxattr(dentry, name, value, size, flags);
+       ret = security_ops->inode_setxattr(dentry, name, value, size, flags);
+       if (ret)
+               return ret;
+       return evm_inode_setxattr(dentry, name, value, size);
  }
  
  void security_inode_post_setxattr(struct dentry *dentry, const char *name,
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return;
        security_ops->inode_post_setxattr(dentry, name, value, size, flags);
+       evm_inode_post_setxattr(dentry, name, value, size);
  }
  
  int security_inode_getxattr(struct dentry *dentry, const char *name)
@@@ -568,9 -627,14 +620,14 @@@ int security_inode_listxattr(struct den
  
  int security_inode_removexattr(struct dentry *dentry, const char *name)
  {
+       int ret;
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_removexattr(dentry, name);
+       ret = security_ops->inode_removexattr(dentry, name);
+       if (ret)
+               return ret;
+       return evm_inode_removexattr(dentry, name);
  }
  
  int security_inode_need_killpriv(struct dentry *dentry)