ext4: insert 3.18 version of fs/ext4, fs/jbd2 and associated header files
Ian Chang [Tue, 10 May 2016 04:37:37 +0000 (12:37 +0800)]
This will cause ext3 and gfs2 to not compile correctly, but it allows
us to get a modern version of ext4 into 3.10.  This makes it easier to
backport newer features such as ext4 encryption into a downrev kernel.
It also fixes a number of xfstest failures that were fixed since 3.10.

The subsequent commits will fix up the 3.18 ext4 codebase so it will
compile against 3.10.

Change-Id: I866e28baf4f11e5ef3fbb3ea9f205a18f4576303
Signed-off-by: Ian Chang <ianc@nvidia.com>

42 files changed:
fs/ext4/acl.c
fs/ext4/acl.h
fs/ext4/balloc.c
fs/ext4/bitmap.c
fs/ext4/block_validity.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/extents_status.h
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/migrate.c
fs/ext4/mmp.c
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/jbd2/Kconfig
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/recovery.c
fs/jbd2/revoke.c
fs/jbd2/transaction.c
include/linux/jbd.h
include/linux/jbd2.h
include/linux/jbd_common.h
include/trace/events/ext4.h

index 39a54a0..d40c8db 100644 (file)
@@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
        struct posix_acl *acl;
        int retval;
 
-       if (!test_opt(inode->i_sb, POSIX_ACL))
-               return NULL;
-
-       acl = get_cached_acl(inode, type);
-       if (acl != ACL_NOT_CACHED)
-               return acl;
-
        switch (type) {
        case ACL_TYPE_ACCESS:
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
  * inode->i_mutex: down unless called from ext4_new_inode
  */
 static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
             struct posix_acl *acl)
 {
        int name_index;
@@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
        size_t size = 0;
        int error;
 
-       if (S_ISLNK(inode->i_mode))
-               return -EOPNOTSUPP;
-
        switch (type) {
        case ACL_TYPE_ACCESS:
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
        return error;
 }
 
-/*
- * Initialize the ACLs of a new inode. Called from ext4_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
 int
-ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-       struct posix_acl *acl = NULL;
-       int error = 0;
-
-       if (!S_ISLNK(inode->i_mode)) {
-               if (test_opt(dir->i_sb, POSIX_ACL)) {
-                       acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
-                       if (IS_ERR(acl))
-                               return PTR_ERR(acl);
-               }
-               if (!acl)
-                       inode->i_mode &= ~current_umask();
-       }
-       if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
-               if (S_ISDIR(inode->i_mode)) {
-                       error = ext4_set_acl(handle, inode,
-                                            ACL_TYPE_DEFAULT, acl);
-                       if (error)
-                               goto cleanup;
-               }
-               error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
-               if (error < 0)
-                       return error;
-
-               if (error > 0) {
-                       /* This is an extended ACL */
-                       error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
-               }
-       }
-cleanup:
-       posix_acl_release(acl);
-       return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like  ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
-       struct posix_acl *acl;
        handle_t *handle;
-       int retries = 0;
-       int error;
-
+       int error, retries = 0;
 
-       if (S_ISLNK(inode->i_mode))
-               return -EOPNOTSUPP;
-       if (!test_opt(inode->i_sb, POSIX_ACL))
-               return 0;
-       acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
-       if (IS_ERR(acl) || !acl)
-               return PTR_ERR(acl);
-       error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-       if (error)
-               return error;
 retry:
        handle = ext4_journal_start(inode, EXT4_HT_XATTR,
                                    ext4_jbd2_credits_xattr(inode));
-       if (IS_ERR(handle)) {
-               error = PTR_ERR(handle);
-               ext4_std_error(inode->i_sb, error);
-               goto out;
-       }
-       error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       error = __ext4_set_acl(handle, inode, type, acl);
        ext4_journal_stop(handle);
-       if (error == -ENOSPC &&
-           ext4_should_retry_alloc(inode->i_sb, &retries))
+       if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
-out:
-       posix_acl_release(acl);
        return error;
 }
 
 /*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-                          const char *name, size_t name_len, int type)
-{
-       const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-       if (!test_opt(dentry->d_sb, POSIX_ACL))
-               return 0;
-       if (list && size <= list_len)
-               memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-       return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-                           const char *name, size_t name_len, int type)
-{
-       const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-       if (!test_opt(dentry->d_sb, POSIX_ACL))
-               return 0;
-       if (list && size <= list_len)
-               memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-       return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
-                  size_t size, int type)
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
-       struct posix_acl *acl;
+       struct posix_acl *default_acl, *acl;
        int error;
 
-       if (strcmp(name, "") != 0)
-               return -EINVAL;
-       if (!test_opt(dentry->d_sb, POSIX_ACL))
-               return -EOPNOTSUPP;
-
-       acl = ext4_get_acl(dentry->d_inode, type);
-       if (IS_ERR(acl))
-               return PTR_ERR(acl);
-       if (acl == NULL)
-               return -ENODATA;
-       error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-       posix_acl_release(acl);
-
-       return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-                  size_t size, int flags, int type)
-{
-       struct inode *inode = dentry->d_inode;
-       handle_t *handle;
-       struct posix_acl *acl;
-       int error, retries = 0;
-
-       if (strcmp(name, "") != 0)
-               return -EINVAL;
-       if (!test_opt(inode->i_sb, POSIX_ACL))
-               return -EOPNOTSUPP;
-       if (!inode_owner_or_capable(inode))
-               return -EPERM;
-
-       if (value) {
-               acl = posix_acl_from_xattr(&init_user_ns, value, size);
-               if (IS_ERR(acl))
-                       return PTR_ERR(acl);
-               else if (acl) {
-                       error = posix_acl_valid(acl);
-                       if (error)
-                               goto release_and_out;
-               }
-       } else
-               acl = NULL;
+       error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+       if (error)
+               return error;
 
-retry:
-       handle = ext4_journal_start(inode, EXT4_HT_XATTR,
-                                   ext4_jbd2_credits_xattr(inode));
-       if (IS_ERR(handle)) {
-               error = PTR_ERR(handle);
-               goto release_and_out;
+       if (default_acl) {
+               error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+                                      default_acl);
+               posix_acl_release(default_acl);
+       }
+       if (acl) {
+               if (!error)
+                       error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+                                              acl);
+               posix_acl_release(acl);
        }
-       error = ext4_set_acl(handle, inode, type, acl);
-       ext4_journal_stop(handle);
-       if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-
-release_and_out:
-       posix_acl_release(acl);
        return error;
 }
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
-       .prefix = POSIX_ACL_XATTR_ACCESS,
-       .flags  = ACL_TYPE_ACCESS,
-       .list   = ext4_xattr_list_acl_access,
-       .get    = ext4_xattr_get_acl,
-       .set    = ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
-       .prefix = POSIX_ACL_XATTR_DEFAULT,
-       .flags  = ACL_TYPE_DEFAULT,
-       .list   = ext4_xattr_list_acl_default,
-       .get    = ext4_xattr_get_acl,
-       .set    = ext4_xattr_set_acl,
-};
index 18cb39e..da2c795 100644 (file)
@@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)
 
 /* acl.c */
 struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-extern int ext4_acl_chmod(struct inode *);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_get_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
-       return 0;
-}
+#define ext4_set_acl NULL
 
 static inline int
 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
index 3742e4c..83a6f49 100644 (file)
@@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb,
 /* Return the number of clusters used for file system metadata; this
  * represents the overhead needed by the file system.
  */
-unsigned ext4_num_overhead_clusters(struct super_block *sb,
-                                   ext4_group_t block_group,
-                                   struct ext4_group_desc *gdp)
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+                                          ext4_group_t block_group,
+                                          struct ext4_group_desc *gdp)
 {
        unsigned num_clusters;
        int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
@@ -176,27 +176,35 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
 }
 
 /* Initializes an uninitialized block bitmap */
-void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
-                           ext4_group_t block_group,
-                           struct ext4_group_desc *gdp)
+static int ext4_init_block_bitmap(struct super_block *sb,
+                                  struct buffer_head *bh,
+                                  ext4_group_t block_group,
+                                  struct ext4_group_desc *gdp)
 {
        unsigned int bit, bit_max;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t start, tmp;
        int flex_bg = 0;
+       struct ext4_group_info *grp;
 
        J_ASSERT_BH(bh, buffer_locked(bh));
 
        /* If checksum is bad mark all blocks used to prevent allocation
         * essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
-               ext4_error(sb, "Checksum bad for group %u", block_group);
-               ext4_free_group_clusters_set(sb, gdp, 0);
-               ext4_free_inodes_set(sb, gdp, 0);
-               ext4_itable_unused_set(sb, gdp, 0);
-               memset(bh->b_data, 0xff, sb->s_blocksize);
-               ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
-               return;
+               grp = ext4_get_group_info(sb, block_group);
+               if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+                       percpu_counter_sub(&sbi->s_freeclusters_counter,
+                                          grp->bb_free);
+               set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+               if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+                       int count;
+                       count = ext4_free_inodes_count(sb, gdp);
+                       percpu_counter_sub(&sbi->s_freeinodes_counter,
+                                          count);
+               }
+               set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+               return -EIO;
        }
        memset(bh->b_data, 0, sb->s_blocksize);
 
@@ -234,6 +242,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                             sb->s_blocksize * 8, bh->b_data);
        ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
        ext4_group_desc_csum_set(sb, block_group, gdp);
+       return 0;
 }
 
 /* Return the number of free blocks in a block group.  It is used when
@@ -305,9 +314,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
  */
 static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
                                            struct ext4_group_desc *desc,
-                                           unsigned int block_group,
+                                           ext4_group_t block_group,
                                            struct buffer_head *bh)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
        ext4_fsblk_t blk;
@@ -327,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
        /* check whether block bitmap block number is set */
        blk = ext4_block_bitmap(sb, desc);
        offset = blk - group_first_block;
-       if (!ext4_test_bit(offset, bh->b_data))
+       if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap */
                return blk;
 
        /* check whether the inode bitmap block number is set */
        blk = ext4_inode_bitmap(sb, desc);
        offset = blk - group_first_block;
-       if (!ext4_test_bit(offset, bh->b_data))
+       if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap */
                return blk;
 
@@ -342,20 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
        blk = ext4_inode_table(sb, desc);
        offset = blk - group_first_block;
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
-                               offset + EXT4_SB(sb)->s_itb_per_group,
-                               offset);
-       if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
+                       EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+                       EXT4_B2C(sbi, offset));
+       if (next_zero_bit <
+           EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
                /* bad bitmap for inode tables */
                return blk;
        return 0;
 }
 
-void ext4_validate_block_bitmap(struct super_block *sb,
-                              struct ext4_group_desc *desc,
-                              unsigned int block_group,
-                              struct buffer_head *bh)
+static void ext4_validate_block_bitmap(struct super_block *sb,
+                                      struct ext4_group_desc *desc,
+                                      ext4_group_t block_group,
+                                      struct buffer_head *bh)
 {
        ext4_fsblk_t    blk;
+       struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        if (buffer_verified(bh))
                return;
@@ -366,12 +379,20 @@ void ext4_validate_block_bitmap(struct super_block *sb,
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
                           block_group, blk);
+               if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+                       percpu_counter_sub(&sbi->s_freeclusters_counter,
+                                          grp->bb_free);
+               set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
                return;
        }
        if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
                        desc, bh))) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+               if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+                       percpu_counter_sub(&sbi->s_freeclusters_counter,
+                                          grp->bb_free);
+               set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
                return;
        }
        set_buffer_verified(bh);
@@ -417,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-               ext4_init_block_bitmap(sb, bh, block_group, desc);
+               int err;
+
+               err = ext4_init_block_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
+               if (err)
+                       ext4_error(sb, "Checksum bad for grp %u", block_group);
                return bh;
        }
        ext4_unlock_group(sb, block_group);
@@ -445,7 +470,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        return bh;
 verify:
        ext4_validate_block_bitmap(sb, desc, block_group, bh);
-       return bh;
+       if (buffer_verified(bh))
+               return bh;
+       put_bh(bh);
+       return NULL;
 }
 
 /* Returns 0 on success, 1 on error */
@@ -469,7 +497,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
        clear_buffer_new(bh);
        /* Panic or remount fs read-only if block bitmap is invalid */
        ext4_validate_block_bitmap(sb, desc, block_group, bh);
-       return 0;
+       /* ...but check for error just in case errors=continue. */
+       return !buffer_verified(bh);
 }
 
 struct buffer_head *
@@ -611,10 +640,8 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metdata, but we do account for it.
         */
-       if (!(*errp) &&
-           ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+       if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-               EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
                dquot_alloc_block_nofail(inode,
                                EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
@@ -634,6 +661,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
        struct ext4_group_desc *gdp;
        ext4_group_t i;
        ext4_group_t ngroups = ext4_get_groups_count(sb);
+       struct ext4_group_info *grp;
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
@@ -649,14 +677,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += ext4_free_group_clusters(sb, gdp);
+               grp = NULL;
+               if (EXT4_SB(sb)->s_group_info)
+                       grp = ext4_get_group_info(sb, i);
+               if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+                       desc_count += ext4_free_group_clusters(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
                        continue;
 
                x = ext4_count_free(bitmap_bh->b_data,
-                                   EXT4_BLOCKS_PER_GROUP(sb) / 8);
+                                   EXT4_CLUSTERS_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
                        i, ext4_free_group_clusters(sb, gdp), x);
                bitmap_count += x;
@@ -673,7 +705,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-               desc_count += ext4_free_group_clusters(sb, gdp);
+               grp = NULL;
+               if (EXT4_SB(sb)->s_group_info)
+                       grp = ext4_get_group_info(sb, i);
+               if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+                       desc_count += ext4_free_group_clusters(sb, gdp);
        }
 
        return desc_count;
@@ -682,21 +718,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-       int num = b;
-
-       while (a > num)
-               num *= b;
-       return num == a;
-}
-
-static int ext4_group_sparse(ext4_group_t group)
-{
-       if (group <= 1)
-               return 1;
-       if (!(group & 1))
-               return 0;
-       return (test_root(group, 7) || test_root(group, 5) ||
-               test_root(group, 3));
+       while (1) {
+               if (a < b)
+                       return 0;
+               if (a == b)
+                       return 1;
+               if ((a % b) != 0)
+                       return 0;
+               a = a / b;
+       }
 }
 
 /**
@@ -709,11 +739,26 @@ static int ext4_group_sparse(ext4_group_t group)
  */
 int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 {
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                               EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
-                       !ext4_group_sparse(group))
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+       if (group == 0)
+               return 1;
+       if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+               if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+                   group == le32_to_cpu(es->s_backup_bgs[1]))
+                       return 1;
                return 0;
-       return 1;
+       }
+       if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+               return 1;
+       if (!(group & 1))
+               return 0;
+       if (test_root(group, 3) || (test_root(group, 5)) ||
+           test_root(group, 7))
+               return 1;
+
+       return 0;
 }
 
 static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
index 3285aa5..b610779 100644 (file)
@@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
        __u32 provided, calculated;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(sb))
                return 1;
 
        provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
@@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
        __u32 csum;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(sb))
                return;
 
        csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
@@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(sb))
                return 1;
 
        provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
        __u32 csum;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(sb))
                return;
 
        csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
index 3f11656..41eb9dc 100644 (file)
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
 /* Called when the filesystem is unmounted */
 void ext4_release_system_zone(struct super_block *sb)
 {
-       struct rb_node  *n = EXT4_SB(sb)->system_blks.rb_node;
-       struct rb_node  *parent;
-       struct ext4_system_zone *entry;
+       struct ext4_system_zone *entry, *n;
 
-       while (n) {
-               /* Do the node's children first */
-               if (n->rb_left) {
-                       n = n->rb_left;
-                       continue;
-               }
-               if (n->rb_right) {
-                       n = n->rb_right;
-                       continue;
-               }
-               /*
-                * The node has no children; free it, and then zero
-                * out parent's link to it.  Finally go to the
-                * beginning of the loop and try to free the parent
-                * node.
-                */
-               parent = rb_parent(n);
-               entry = rb_entry(n, struct ext4_system_zone, node);
+       rbtree_postorder_for_each_entry_safe(entry, n,
+                       &EXT4_SB(sb)->system_blks, node)
                kmem_cache_free(ext4_system_zone_cachep, entry);
-               if (!parent)
-                       EXT4_SB(sb)->system_blks = RB_ROOT;
-               else if (parent->rb_left == n)
-                       parent->rb_left = NULL;
-               else if (parent->rb_right == n)
-                       parent->rb_right = NULL;
-               n = parent;
-       }
+
        EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
index f8d56e4..c24143e 100644 (file)
 #include "ext4.h"
 #include "xattr.h"
 
-static int ext4_dx_readdir(struct file *filp,
-                          void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
 
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which chould potentially get coverted to use htree
+ * (or a directory which could potentially get converted to use htree
  * indexing).
  *
  * Return 1 if it is a dx dir, 0 if not
@@ -103,79 +102,72 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
        return 1;
 }
 
-static int ext4_readdir(struct file *filp,
-                        void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
 {
-       int error = 0;
        unsigned int offset;
-       int i, stored;
+       int i;
        struct ext4_dir_entry_2 *de;
        int err;
-       struct inode *inode = file_inode(filp);
+       struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
-       int ret = 0;
        int dir_has_error = 0;
 
        if (is_dx_dir(inode)) {
-               err = ext4_dx_readdir(filp, dirent, filldir);
+               err = ext4_dx_readdir(file, ctx);
                if (err != ERR_BAD_DX_DIR) {
-                       ret = err;
-                       goto out;
+                       return err;
                }
                /*
                 * We don't set the inode dirty flag since it's not
                 * critical that it get flushed back to the disk.
                 */
-               ext4_clear_inode_flag(file_inode(filp),
+               ext4_clear_inode_flag(file_inode(file),
                                      EXT4_INODE_INDEX);
        }
 
        if (ext4_has_inline_data(inode)) {
                int has_inline_data = 1;
-               ret = ext4_read_inline_dir(filp, dirent, filldir,
+               int ret = ext4_read_inline_dir(file, ctx,
                                           &has_inline_data);
                if (has_inline_data)
                        return ret;
        }
 
-       stored = 0;
-       offset = filp->f_pos & (sb->s_blocksize - 1);
+       offset = ctx->pos & (sb->s_blocksize - 1);
 
-       while (!error && !stored && filp->f_pos < inode->i_size) {
+       while (ctx->pos < inode->i_size) {
                struct ext4_map_blocks map;
                struct buffer_head *bh = NULL;
 
-               map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+               map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                map.m_len = 1;
                err = ext4_map_blocks(NULL, inode, &map, 0);
                if (err > 0) {
                        pgoff_t index = map.m_pblk >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
-                       if (!ra_has_index(&filp->f_ra, index))
+                       if (!ra_has_index(&file->f_ra, index))
                                page_cache_sync_readahead(
                                        sb->s_bdev->bd_inode->i_mapping,
-                                       &filp->f_ra, filp,
+                                       &file->f_ra, file,
                                        index, 1);
-                       filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-                       bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+                       file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+                       bh = ext4_bread(NULL, inode, map.m_lblk, 0);
+                       if (IS_ERR(bh))
+                               return PTR_ERR(bh);
                }
 
-               /*
-                * We ignore I/O errors on directories so users have a chance
-                * of recovering data when there's a bad sector
-                */
                if (!bh) {
                        if (!dir_has_error) {
-                               EXT4_ERROR_FILE(filp, 0,
+                               EXT4_ERROR_FILE(file, 0,
                                                "directory contains a "
                                                "hole at offset %llu",
-                                          (unsigned long long) filp->f_pos);
+                                          (unsigned long long) ctx->pos);
                                dir_has_error = 1;
                        }
                        /* corrupt size?  Maybe no more blocks to read */
-                       if (filp->f_pos > inode->i_blocks << 9)
+                       if (ctx->pos > inode->i_blocks << 9)
                                break;
-                       filp->f_pos += sb->s_blocksize - offset;
+                       ctx->pos += sb->s_blocksize - offset;
                        continue;
                }
 
@@ -183,21 +175,20 @@ static int ext4_readdir(struct file *filp,
                if (!buffer_verified(bh) &&
                    !ext4_dirent_csum_verify(inode,
                                (struct ext4_dir_entry *)bh->b_data)) {
-                       EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+                       EXT4_ERROR_FILE(file, 0, "directory fails checksum "
                                        "at offset %llu",
-                                       (unsigned long long)filp->f_pos);
-                       filp->f_pos += sb->s_blocksize - offset;
+                                       (unsigned long long)ctx->pos);
+                       ctx->pos += sb->s_blocksize - offset;
                        brelse(bh);
                        continue;
                }
                set_buffer_verified(bh);
 
-revalidate:
                /* If the dir block has changed since the last call to
                 * readdir(2), then we might be pointing to an invalid
                 * dirent right now.  Scan from the start of the block
                 * to make sure. */
-               if (filp->f_version != inode->i_version) {
+               if (file->f_version != inode->i_version) {
                        for (i = 0; i < sb->s_blocksize && i < offset; ) {
                                de = (struct ext4_dir_entry_2 *)
                                        (bh->b_data + i);
@@ -214,57 +205,46 @@ revalidate:
                                                            sb->s_blocksize);
                        }
                        offset = i;
-                       filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+                       ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
                                | offset;
-                       filp->f_version = inode->i_version;
+                       file->f_version = inode->i_version;
                }
 
-               while (!error && filp->f_pos < inode->i_size
+               while (ctx->pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-                       if (ext4_check_dir_entry(inode, filp, de, bh,
+                       if (ext4_check_dir_entry(inode, file, de, bh,
                                                 bh->b_data, bh->b_size,
                                                 offset)) {
                                /*
-                                * On error, skip the f_pos to the next block
+                                * On error, skip to the next block
                                 */
-                               filp->f_pos = (filp->f_pos |
+                               ctx->pos = (ctx->pos |
                                                (sb->s_blocksize - 1)) + 1;
-                               brelse(bh);
-                               ret = stored;
-                               goto out;
+                               break;
                        }
                        offset += ext4_rec_len_from_disk(de->rec_len,
                                        sb->s_blocksize);
                        if (le32_to_cpu(de->inode)) {
-                               /* We might block in the next section
-                                * if the data destination is
-                                * currently swapped out.  So, use a
-                                * version stamp to detect whether or
-                                * not the directory has been modified
-                                * during the copy operation.
-                                */
-                               u64 version = filp->f_version;
-
-                               error = filldir(dirent, de->name,
+                               if (!dir_emit(ctx, de->name,
                                                de->name_len,
-                                               filp->f_pos,
                                                le32_to_cpu(de->inode),
-                                               get_dtype(sb, de->file_type));
-                               if (error)
-                                       break;
-                               if (version != filp->f_version)
-                                       goto revalidate;
-                               stored++;
+                                               get_dtype(sb, de->file_type))) {
+                                       brelse(bh);
+                                       return 0;
+                               }
                        }
-                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+                       ctx->pos += ext4_rec_len_from_disk(de->rec_len,
                                                sb->s_blocksize);
                }
                offset = 0;
                brelse(bh);
+               if (ctx->pos < inode->i_size) {
+                       if (!dir_relax(inode))
+                               return 0;
+               }
        }
-out:
-       return ret;
+       return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -370,41 +350,16 @@ struct fname {
  */
 static void free_rb_tree_fname(struct rb_root *root)
 {
-       struct rb_node  *n = root->rb_node;
-       struct rb_node  *parent;
-       struct fname    *fname;
-
-       while (n) {
-               /* Do the node's children first */
-               if (n->rb_left) {
-                       n = n->rb_left;
-                       continue;
-               }
-               if (n->rb_right) {
-                       n = n->rb_right;
-                       continue;
-               }
-               /*
-                * The node has no children; free it, and then zero
-                * out parent's link to it.  Finally go to the
-                * beginning of the loop and try to free the parent
-                * node.
-                */
-               parent = rb_parent(n);
-               fname = rb_entry(n, struct fname, rb_hash);
+       struct fname *fname, *next;
+
+       rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
                while (fname) {
                        struct fname *old = fname;
                        fname = fname->next;
                        kfree(old);
                }
-               if (!parent)
-                       *root = RB_ROOT;
-               else if (parent->rb_left == n)
-                       parent->rb_left = NULL;
-               else if (parent->rb_right == n)
-                       parent->rb_right = NULL;
-               n = parent;
-       }
+
+       *root = RB_ROOT;
 }
 
 
@@ -492,16 +447,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-                       filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+                       struct fname *fname)
 {
-       struct dir_private_info *info = filp->private_data;
-       loff_t  curr_pos;
-       struct inode *inode = file_inode(filp);
-       struct super_block *sb;
-       int error;
-
-       sb = inode->i_sb;
+       struct dir_private_info *info = file->private_data;
+       struct inode *inode = file_inode(file);
+       struct super_block *sb = inode->i_sb;
 
        if (!fname) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +460,44 @@ static int call_filldir(struct file *filp, void *dirent,
                         inode->i_ino, current->comm);
                return 0;
        }
-       curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+       ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
        while (fname) {
-               error = filldir(dirent, fname->name,
-                               fname->name_len, curr_pos,
+               if (!dir_emit(ctx, fname->name,
+                               fname->name_len,
                                fname->inode,
-                               get_dtype(sb, fname->file_type));
-               if (error) {
-                       filp->f_pos = curr_pos;
+                               get_dtype(sb, fname->file_type))) {
                        info->extra_fname = fname;
-                       return error;
+                       return 1;
                }
                fname = fname->next;
        }
        return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-                        void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-       struct dir_private_info *info = filp->private_data;
-       struct inode *inode = file_inode(filp);
+       struct dir_private_info *info = file->private_data;
+       struct inode *inode = file_inode(file);
        struct fname *fname;
        int     ret;
 
        if (!info) {
-               info = ext4_htree_create_dir_info(filp, filp->f_pos);
+               info = ext4_htree_create_dir_info(file, ctx->pos);
                if (!info)
                        return -ENOMEM;
-               filp->private_data = info;
+               file->private_data = info;
        }
 
-       if (filp->f_pos == ext4_get_htree_eof(filp))
+       if (ctx->pos == ext4_get_htree_eof(file))
                return 0;       /* EOF */
 
        /* Some one has messed with f_pos; reset the world */
-       if (info->last_pos != filp->f_pos) {
+       if (info->last_pos != ctx->pos) {
                free_rb_tree_fname(&info->root);
                info->curr_node = NULL;
                info->extra_fname = NULL;
-               info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-               info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+               info->curr_hash = pos2maj_hash(file, ctx->pos);
+               info->curr_minor_hash = pos2min_hash(file, ctx->pos);
        }
 
        /*
@@ -557,7 +505,7 @@ static int ext4_dx_readdir(struct file *filp,
         * chain, return them first.
         */
        if (info->extra_fname) {
-               if (call_filldir(filp, dirent, filldir, info->extra_fname))
+               if (call_filldir(file, ctx, info->extra_fname))
                        goto finished;
                info->extra_fname = NULL;
                goto next_node;
@@ -571,17 +519,17 @@ static int ext4_dx_readdir(struct file *filp,
                 * cached entries.
                 */
                if ((!info->curr_node) ||
-                   (filp->f_version != inode->i_version)) {
+                   (file->f_version != inode->i_version)) {
                        info->curr_node = NULL;
                        free_rb_tree_fname(&info->root);
-                       filp->f_version = inode->i_version;
-                       ret = ext4_htree_fill_tree(filp, info->curr_hash,
+                       file->f_version = inode->i_version;
+                       ret = ext4_htree_fill_tree(file, info->curr_hash,
                                                   info->curr_minor_hash,
                                                   &info->next_hash);
                        if (ret < 0)
                                return ret;
                        if (ret == 0) {
-                               filp->f_pos = ext4_get_htree_eof(filp);
+                               ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_node = rb_first(&info->root);
@@ -590,7 +538,7 @@ static int ext4_dx_readdir(struct file *filp,
                fname = rb_entry(info->curr_node, struct fname, rb_hash);
                info->curr_hash = fname->hash;
                info->curr_minor_hash = fname->minor_hash;
-               if (call_filldir(filp, dirent, filldir, fname))
+               if (call_filldir(file, ctx, fname))
                        break;
        next_node:
                info->curr_node = rb_next(info->curr_node);
@@ -601,7 +549,7 @@ static int ext4_dx_readdir(struct file *filp,
                        info->curr_minor_hash = fname->minor_hash;
                } else {
                        if (info->next_hash == ~0) {
-                               filp->f_pos = ext4_get_htree_eof(filp);
+                               ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_hash = info->next_hash;
@@ -609,7 +557,7 @@ static int ext4_dx_readdir(struct file *filp,
                }
        }
 finished:
-       info->last_pos = filp->f_pos;
+       info->last_pos = ctx->pos;
        return 0;
 }
 
@@ -621,10 +569,35 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
        return 0;
 }
 
+int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
+                     int buf_size)
+{
+       struct ext4_dir_entry_2 *de;
+       int nlen, rlen;
+       unsigned int offset = 0;
+       char *top;
+
+       de = (struct ext4_dir_entry_2 *)buf;
+       top = buf + buf_size;
+       while ((char *) de < top) {
+               if (ext4_check_dir_entry(dir, NULL, de, bh,
+                                        buf, buf_size, offset))
+                       return -EIO;
+               nlen = EXT4_DIR_REC_LEN(de->name_len);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+               de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+               offset += rlen;
+       }
+       if ((char *) de > top)
+               return -EIO;
+
+       return 0;
+}
+
 const struct file_operations ext4_dir_operations = {
        .llseek         = ext4_dir_llseek,
        .read           = generic_read_dir,
-       .readdir        = ext4_readdir,
+       .iterate        = ext4_readdir,
        .unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
index e5a77bb..a2f4422 100644 (file)
@@ -29,7 +29,9 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
 #include <crypto/hash.h>
+#include <linux/falloc.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -156,7 +158,6 @@ struct ext4_allocation_request {
 #define EXT4_MAP_MAPPED                (1 << BH_Mapped)
 #define EXT4_MAP_UNWRITTEN     (1 << BH_Unwritten)
 #define EXT4_MAP_BOUNDARY      (1 << BH_Boundary)
-#define EXT4_MAP_UNINIT                (1 << BH_Uninit)
 /* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
  * ext4_map_blocks wants to know whether or not the underlying cluster has
  * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
@@ -167,7 +168,7 @@ struct ext4_allocation_request {
 #define EXT4_MAP_FROM_CLUSTER  (1 << BH_AllocFromCluster)
 #define EXT4_MAP_FLAGS         (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
-                                EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
+                                EXT4_MAP_FROM_CLUSTER)
 
 struct ext4_map_blocks {
        ext4_fsblk_t m_pblk;
@@ -177,38 +178,25 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-       struct inode *inode;
-       sector_t b_blocknr;             /* start block number of extent */
-       size_t b_size;                  /* size of extent */
-       unsigned long b_state;          /* state of the extent */
-       unsigned long first_page, next_page;    /* extent of pages */
-       struct writeback_control *wbc;
-       int io_done;
-       int pages_written;
-       int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define        EXT4_IO_END_UNWRITTEN   0x0001
-#define EXT4_IO_END_ERROR      0x0002
-#define EXT4_IO_END_DIRECT     0x0004
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting unwritten extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
        struct list_head        list;           /* per-file finished IO list */
+       handle_t                *handle;        /* handle reserved for extent
+                                                * conversion */
        struct inode            *inode;         /* file being written to */
+       struct bio              *bio;           /* Linked list of completed
+                                                * bios covering the extent */
        unsigned int            flag;           /* unwritten or not */
        loff_t                  offset;         /* offset in the file */
        ssize_t                 size;           /* size of the extent */
-       struct kiocb            *iocb;          /* iocb struct for AIO */
-       int                     result;         /* error value for AIO */
+       atomic_t                count;          /* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -548,26 +536,26 @@ enum {
 /*
  * Flags used by ext4_map_blocks()
  */
-       /* Allocate any needed blocks and/or convert an unitialized
+       /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized ext4 */
 #define EXT4_GET_BLOCKS_CREATE                 0x0001
-       /* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT             0x0002
-#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT      (EXT4_GET_BLOCKS_UNINIT_EXT|\
+       /* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT             0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT      (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE       0x0004
        /* caller is from the direct IO path, request to creation of an
-       unitialized extents if not allocated, split the uninitialized
+       unwritten extents if not allocated, split the unwritten
        extent if blocks has been preallocated already*/
 #define EXT4_GET_BLOCKS_PRE_IO                 0x0008
 #define EXT4_GET_BLOCKS_CONVERT                        0x0010
 #define EXT4_GET_BLOCKS_IO_CREATE_EXT          (EXT4_GET_BLOCKS_PRE_IO|\
-                                        EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+                                        EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT         (EXT4_GET_BLOCKS_CONVERT|\
-                                        EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+                                        EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that.*/
 #define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
@@ -579,6 +567,20 @@ enum {
 #define EXT4_GET_BLOCKS_NO_LOCK                        0x0100
        /* Do not put hole in extent cache */
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE            0x0200
+       /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN      0x0400
+
+/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE                                0x40000000
+#define EXT4_EX_FORCE_CACHE                    0x20000000
 
 /*
  * Flags used by ext4_free_blocks
@@ -589,12 +591,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE        0x0008
 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER  0x0010
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER   0x0020
-#define EXT4_FREE_BLOCKS_RESERVE               0x0040
-
-/*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED  0x0001
 
 /*
  * ioctl commands
@@ -616,6 +612,7 @@ enum {
 #define EXT4_IOC_MOVE_EXT              _IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS             _IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT             _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS      _IO('f', 18)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -877,6 +874,8 @@ struct ext4_inode_info {
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
 
+       spinlock_t i_raw_lock;  /* protects updates to the raw inode */
+
        /*
         * File creation time. Its function is same as that of
         * struct timespec i_{a,c,m}time in the generic inode.
@@ -891,7 +890,9 @@ struct ext4_inode_info {
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_lru;
+       unsigned int i_es_all_nr;       /* protected by i_es_lock */
        unsigned int i_es_lru_nr;       /* protected by i_es_lock */
+       unsigned long i_touch_when;     /* jiffies of last accessing */
 
        /* ialloc */
        ext4_group_t    i_last_alloc_group;
@@ -916,12 +917,20 @@ struct ext4_inode_info {
        qsize_t i_reserved_quota;
 #endif
 
-       /* completed IOs that might need unwritten extents handling */
-       struct list_head i_completed_io_list;
+       /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
+       /*
+        * Completed IOs that need unwritten extents handling and have
+        * transaction reserved
+        */
+       struct list_head i_rsv_conversion_list;
+       /*
+        * Completed IOs that need unwritten extents handling and don't have
+        * transaction reserved
+        */
        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-       struct work_struct i_unwritten_work;    /* deferred extent conversion */
+       struct work_struct i_rsv_conversion_work;
 
        spinlock_t i_block_reservation_lock;
 
@@ -993,6 +1002,8 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_STD_GROUP_SIZE     0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
+#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
+                                                     file systems */
 
 #define clear_opt(sb, opt)             EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt
@@ -1149,7 +1160,8 @@ struct ext4_super_block {
        __le32  s_usr_quota_inum;       /* inode for tracking user quota */
        __le32  s_grp_quota_inum;       /* inode for tracking group quota */
        __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
-       __le32  s_reserved[108];        /* Padding to the end of the block */
+       __le32  s_backup_bgs[2];        /* groups with sparse_super2 SBs */
+       __le32  s_reserved[106];        /* Padding to the end of the block */
        __le32  s_checksum;             /* crc32c(superblock) */
 };
 
@@ -1163,6 +1175,9 @@ struct ext4_super_block {
 #define EXT4_MF_MNTDIR_SAMPLED 0x0001
 #define EXT4_MF_FS_ABORTED     0x0002  /* Fatal error detected */
 
+/* Number of quota types we support */
+#define EXT4_MAXQUOTAS 2
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1226,7 +1241,7 @@ struct ext4_sb_info {
        u32 s_min_batch_time;
        struct block_device *journal_bdev;
 #ifdef CONFIG_QUOTA
-       char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
+       char *s_qf_names[EXT4_MAXQUOTAS];       /* Names of quota files with journalled quota */
        int s_jquota_fmt;                       /* Format of quota to use */
 #endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -1258,7 +1273,6 @@ struct ext4_sb_info {
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
-       unsigned int s_max_writeback_mb_bump;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
@@ -1294,8 +1308,8 @@ struct ext4_sb_info {
        struct flex_groups *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;
 
-       /* workqueue for dio unwritten */
-       struct workqueue_struct *dio_unwritten_wq;
+       /* workqueue for reserved extent conversions (buffered io) */
+       struct workqueue_struct *rsv_conversion_wq;
 
        /* timer for periodic error stats printing */
        struct timer_list s_err_report;
@@ -1320,8 +1334,14 @@ struct ext4_sb_info {
        /* Reclaim extents from extent status tree */
        struct shrinker s_es_shrinker;
        struct list_head s_es_lru;
-       struct percpu_counter s_extent_cache_cnt;
+       struct ext4_es_stats s_es_stats;
+       struct mb_cache *s_mb_cache;
        spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+       /* Ratelimit ext4 messages. */
+       struct ratelimit_state s_err_ratelimit_state;
+       struct ratelimit_state s_warning_ratelimit_state;
+       struct ratelimit_state s_msg_ratelimit_state;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1382,11 +1402,11 @@ enum {
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
        EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
-       EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_ORDERED_MODE,        /* data=ordered mode */
+       EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)                                \
@@ -1403,7 +1423,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);           \
 }
 
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
 EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name.  Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
 #if (BITS_PER_LONG < 64)
 EXT4_INODE_BIT_FNS(state, state_flags, 0)
 
@@ -1477,6 +1508,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_COMPAT_EXT_ATTR           0x0008
 #define EXT4_FEATURE_COMPAT_RESIZE_INODE       0x0010
 #define EXT4_FEATURE_COMPAT_DIR_INDEX          0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2      0x0200
 
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER    0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE      0x0002
@@ -1795,7 +1827,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 /*
  * Special error return code only used by dx_probe() and its callers.
  */
-#define ERR_BAD_DX_DIR -75000
+#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
 
 /*
  * Timeout and state flag for lazy initialization inode thread.
@@ -1925,10 +1957,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
 extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);
 
-extern void ext4_validate_block_bitmap(struct super_block *sb,
-                                      struct ext4_group_desc *desc,
-                                      unsigned int block_group,
-                                      struct buffer_head *bh);
 extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
 extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1957,16 +1985,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  struct buffer_head *bh);
 extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
-extern void ext4_init_block_bitmap(struct super_block *sb,
-                                  struct buffer_head *bh,
-                                  ext4_group_t group,
-                                  struct ext4_group_desc *desc);
 extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
-extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
-                                          ext4_group_t block_group,
-                                          struct ext4_group_desc *gdp);
 ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
 /* dir.c */
@@ -2009,10 +2030,11 @@ static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
 
        return ext4_filetype_table[filetype];
 }
+extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
+                            void *buf, int buf_size);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2067,10 +2089,8 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *,
                                unsigned long blkdev_flags);
 
 /* inode.c */
-struct buffer_head *ext4_getblk(handle_t *, struct inode *,
-                                               ext4_lblk_t, int, int *);
-struct buffer_head *ext4_bread(handle_t *, struct inode *,
-                                               ext4_lblk_t, int, int *);
+struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
+struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
@@ -2101,9 +2121,10 @@ extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2111,9 +2132,8 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-               struct address_space *mapping, loff_t from,
-               loff_t length, int flags);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+                            loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2123,13 +2143,12 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
 extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
-                               const struct iovec *iov, loff_t offset,
-                               unsigned long nr_segs);
+                               struct iov_iter *iter, loff_t offset);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
-extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t first, ext4_lblk_t stop);
+extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
+                                ext4_lblk_t start, ext4_lblk_t end);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2171,8 +2190,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
 extern int ext4_calculate_overhead(struct super_block *sb);
-extern int ext4_superblock_csum_verify(struct super_block *sb,
-                                      struct ext4_super_block *es);
 extern void ext4_superblock_csum_set(struct super_block *sb);
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
 extern void *ext4_kvzalloc(size_t size, gfp_t flags);
@@ -2181,42 +2198,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
                  const char *, ...);
-#define ext4_error(sb, message...)     __ext4_error(sb, __func__,      \
-                                                    __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
                      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
                  const char *, ...);
-#define ext4_abort(sb, message...)     __ext4_abort(sb, __func__, \
-                                                      __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
-#define ext4_warning(sb, message...)   __ext4_warning(sb, __func__, \
-                                                      __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)     __dump_mmp_msg(sb, mmp, __func__, \
-                                                      __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-       __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)           \
+       __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)             \
+       __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)                                       \
+       __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)                                       \
+       __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)                                     \
+       __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)                          \
+       __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)                                     \
+       __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)           \
+       __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+                               fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)           \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_error_inode(inode, "", 0, block, " ");                   \
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)             \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_error_file(file, "", 0, block, " ");                     \
+} while (0)
+#define ext4_error(sb, fmt, ...)                                       \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_error(sb, "", 0, " ");                                   \
+} while (0)
+#define ext4_abort(sb, fmt, ...)                                       \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_abort(sb, "", 0, " ");                                   \
+} while (0)
+#define ext4_warning(sb, fmt, ...)                                     \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_warning(sb, "", 0, " ");                                 \
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)                                  \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                                  \
+       __ext4_msg(sb, "", " ");                                        \
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)                                     \
+       __dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)           \
+do {                                                                   \
+       no_printk(fmt, ##__VA_ARGS__);                          \
+       __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");       \
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
                                        __u32 compat);
@@ -2267,6 +2338,14 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb)
               (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
 
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+       WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+                    !EXT4_SB(sb)->s_chksum_driver);
+
+       return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
        return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
@@ -2327,6 +2406,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
         struct ext4_group_info ***grp_info;
         long indexv, indexh;
+        BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
         grp_info = EXT4_SB(sb)->s_group_info;
         indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
         indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2373,16 +2453,31 @@ do {                                                            \
 #define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
-       /*
-        * XXX: replace with spinlock if seen contended -bzzz
-        */
+       WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+                    !mutex_is_locked(&inode->i_mutex));
        down_write(&EXT4_I(inode)->i_data_sem);
        if (newsize > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = newsize;
        up_write(&EXT4_I(inode)->i_data_sem);
-       return ;
+}
+
+/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
+{
+       int changed = 0;
+
+       if (newsize > inode->i_size) {
+               i_size_write(inode, newsize);
+               changed = 1;
+       }
+       if (newsize > EXT4_I(inode)->i_disksize) {
+               ext4_update_i_disksize(inode, newsize);
+               changed |= 2;
+       }
+       return changed;
 }
 
 struct ext4_group_info {
@@ -2405,9 +2500,15 @@ struct ext4_group_info {
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT          0
 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT    2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT    3
 
 #define EXT4_MB_GRP_NEED_INIT(grp)     \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)       \
+       (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)       \
+       (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
 
 #define EXT4_MB_GRP_WAS_TRIMMED(grp)   \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
@@ -2484,19 +2585,10 @@ extern const struct file_operations ext4_dir_operations;
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
-extern void ext4_unwritten_wait(struct inode *inode);
 
 /* inline.c */
-extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
 extern int ext4_get_max_inline_size(struct inode *inode);
 extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
-                                  struct ext4_iloc *iloc,
-                                  void *buffer, loff_t pos,
-                                  unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
-                                   unsigned int len);
 extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
                                 unsigned int len);
 extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
@@ -2530,7 +2622,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
 extern int ext4_read_inline_dir(struct file *filp,
-                               void *dirent, filldir_t filldir,
+                               struct dir_context *ctx,
                                int *has_inline_data);
 extern int htree_inlinedir_to_tree(struct file *dir_file,
                                   struct inode *dir, ext4_lblk_t block,
@@ -2560,6 +2652,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
 
 extern int ext4_convert_inline_data(struct inode *inode);
 
+static inline int ext4_has_inline_data(struct inode *inode)
+{
+       return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+              EXT4_I(inode)->i_inline_off;
+}
+
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
@@ -2611,10 +2709,15 @@ extern int ext4_check_blockref(const char *, unsigned int,
 struct ext4_ext_path;
 struct ext4_extent;
 
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-                                      int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2624,8 +2727,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-                         ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+                                         loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2638,65 +2741,66 @@ extern int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *,
-                                 struct ext4_ext_path *,
+                                 struct ext4_ext_path **,
                                  struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-                                                 struct ext4_ext_path *);
+extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
+                                             struct ext4_ext_path **,
+                                             int flags);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 extern int ext4_find_delalloc_range(struct inode *inode,
                                    ext4_lblk_t lblk_start,
                                    ext4_lblk_t lblk_end);
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+                               struct inode *inode2, ext4_lblk_t lblk1,
+                            ext4_lblk_t lblk2,  ext4_lblk_t count,
+                            int mark_unwritten,int *err);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
 extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
-void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
-void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+                               struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
-                              struct writeback_control *wbc);
+                              struct writeback_control *wbc,
+                              bool keep_towrite);
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
-extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
-extern int ext4_mmp_csum_verify(struct super_block *sb,
-                               struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
-       BH_Uninit       /* blocks are allocated but uninitialized on disk */
-         = BH_JBDPrivateStart,
-       BH_AllocFromCluster,    /* allocated blocks were part of already
-                                * allocated cluster. Note that this flag will
-                                * never, ever appear in a buffer_head's state
-                                * flag. See EXT4_MAP_FROM_CLUSTER to see where
-                                * this is used. */
+       BH_AllocFromCluster     /* allocated blocks were part of already
+                                * allocated cluster. */
+       = BH_JBDPrivateStart
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
index 51bc821..3c93815 100644 (file)
@@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh)
 struct ext4_ext_path {
        ext4_fsblk_t                    p_block;
        __u16                           p_depth;
+       __u16                           p_maxdepth;
        struct ext4_extent              *p_ext;
        struct ext4_extent_idx          *p_idx;
        struct ext4_extent_header       *p_hdr;
@@ -134,30 +135,24 @@ struct ext4_ext_path {
  */
 
 /*
- * Maximum number of logical blocks in a file; ext4_extent's ee_block is
- * __le32.
- */
-#define EXT_MAX_BLOCKS 0xffffffff
-
-/*
  * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
  * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
  * MSB of ee_len field in the extent datastructure to signify if this
- * particular extent is an initialized extent or an uninitialized (i.e.
+ * particular extent is an initialized extent or an unwritten (i.e.
  * preallocated).
- * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
- * uninitialized extent.
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
  * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
- * uninitialized one. In other words, if MSB of ee_len is set, it is an
- * uninitialized extent with only one special scenario when ee_len = 0x8000.
- * In this case we can not have an uninitialized extent of zero length and
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
  * thus we make it as a special case of initialized extent with 0x8000 length.
  * This way we get better extent-to-group alignment for initialized extents.
  * Hence, the maximum number of blocks we can have in an *initialized*
- * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
  */
 #define EXT_INIT_MAX_LEN       (1UL << 15)
-#define EXT_UNINIT_MAX_LEN     (EXT_INIT_MAX_LEN - 1)
+#define EXT_UNWRITTEN_MAX_LEN  (EXT_INIT_MAX_LEN - 1)
 
 
 #define EXT_FIRST_EXTENT(__hdr__) \
@@ -193,14 +188,14 @@ static inline unsigned short ext_depth(struct inode *inode)
        return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
 
-static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
 {
-       /* We can not have an uninitialized extent of zero length! */
+       /* We can not have an unwritten extent of zero length! */
        BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
        ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
 }
 
-static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
 {
        /* Extent with ee_len of 0x8000 is treated as an initialized extent */
        return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
index 1be3996..d418431 100644 (file)
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-                                 int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
        journal_t *journal;
 
        might_sleep();
-
-       trace_ext4_journal_start(sb, nblocks, _RET_IP_);
        if (sb->s_flags & MS_RDONLY)
-               return ERR_PTR(-EROFS);
-
+               return -EROFS;
        WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
        journal = EXT4_SB(sb)->s_journal;
-       if (!journal)
-               return ext4_get_nojournal();
        /*
         * Special case here: if the journal has aborted behind our
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly.
         */
-       if (is_journal_aborted(journal)) {
+       if (journal && is_journal_aborted(journal)) {
                ext4_abort(sb, "Detected aborted journal");
-               return ERR_PTR(-EROFS);
+               return -EROFS;
        }
-       return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+       return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+                                 int type, int blocks, int rsv_blocks)
+{
+       journal_t *journal;
+       int err;
+
+       trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+       err = ext4_journal_check_start(sb);
+       if (err < 0)
+               return ERR_PTR(err);
+
+       journal = EXT4_SB(sb)->s_journal;
+       if (!journal)
+               return ext4_get_nojournal();
+       return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+                                  type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -75,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
                ext4_put_nojournal(handle);
                return 0;
        }
+
+       if (!handle->h_transaction) {
+               err = jbd2_journal_stop(handle);
+               return handle->h_err ? handle->h_err : err;
+       }
+
        sb = handle->h_transaction->t_journal->j_private;
        err = handle->h_err;
        rc = jbd2_journal_stop(handle);
@@ -86,9 +104,34 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
        return err;
 }
 
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
-                              const char *err_fn, struct buffer_head *bh,
-                              handle_t *handle, int err)
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+                                       int type)
+{
+       struct super_block *sb;
+       int err;
+
+       if (!ext4_handle_valid(handle))
+               return ext4_get_nojournal();
+
+       sb = handle->h_journal->j_private;
+       trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+                                         _RET_IP_);
+       err = ext4_journal_check_start(sb);
+       if (err < 0) {
+               jbd2_journal_free_reserved(handle);
+               return ERR_PTR(err);
+       }
+
+       err = jbd2_journal_start_reserved(handle, type, line);
+       if (err < 0)
+               return ERR_PTR(err);
+       return handle;
+}
+
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+                                     const char *err_fn,
+                                     struct buffer_head *bh,
+                                     handle_t *handle, int err)
 {
        char nbuf[16];
        const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -219,10 +262,20 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
        set_buffer_prio(bh);
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
-               /* Errors can only happen if there is a bug */
-               if (WARN_ON_ONCE(err)) {
+               /* Errors can only happen due to aborted journal or a nasty bug */
+               if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
+                       if (inode == NULL) {
+                               pr_err("EXT4: jbd2_journal_dirty_metadata "
+                                      "failed: handle type %u started at "
+                                      "line %u, credits %u/%u, errcode %d",
+                                      handle->h_type,
+                                      handle->h_line_no,
+                                      handle->h_requested_credits,
+                                      handle->h_buffer_credits, err);
+                               return err;
+                       }
                        ext4_error_inode(inode, where, line,
                                         bh->b_blocknr,
                                         "journal_dirty_metadata failed: "
index c8c6885..9c5b49f 100644 (file)
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
 #endif
-#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
 static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 {
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
@@ -196,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
- * Return true if object was sucessfully removed
+ * Return true if object was successfully removed
  */
 static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
@@ -230,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 /*
  * Wrapper functions with which ext4 calls into JBD.
  */
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
-                              const char *err_fn,
-               struct buffer_head *bh, handle_t *handle, int err);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh);
 
@@ -265,7 +262,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
        __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-                                 int type, int nblocks);
+                                 int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +297,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)                       \
-       __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+       __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)                       \
-       __ext4_journal_start((inode), __LINE__, (type), (nblocks))
+       __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+       __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
-                                            int nblocks)
+                                            int blocks, int rsv_blocks)
 {
-       return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+       return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+                                      rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+       __ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+                                       int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+       if (ext4_handle_valid(handle))
+               jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
        return journal_current_handle();
index 1a667fb..b5fcb1a 100644 (file)
@@ -37,7 +37,6 @@
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
@@ -51,8 +50,8 @@
  */
 #define EXT4_EXT_MAY_ZEROOUT   0x1  /* safe to zeroout if split fails \
                                        due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1  0x2  /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2  0x4  /* mark second half uninitialized */
+#define EXT4_EXT_MARK_UNWRIT1  0x2  /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2  0x4  /* mark second half unwritten */
 
 #define EXT4_EXT_DATA_VALID1   0x8  /* first half contains valid data */
 #define EXT4_EXT_DATA_VALID2   0x10 /* second half contains valid data */
@@ -74,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
 {
        struct ext4_extent_tail *et;
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;
 
        et = find_ext4_extent_tail(eh);
@@ -89,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
 {
        struct ext4_extent_tail *et;
 
-       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
-               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+       if (!ext4_has_metadata_csum(inode->i_sb))
                return;
 
        et = find_ext4_extent_tail(eh);
@@ -99,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode,
 
 static int ext4_split_extent(handle_t *handle,
                                struct inode *inode,
-                               struct ext4_ext_path *path,
+                               struct ext4_ext_path **ppath,
                                struct ext4_map_blocks *map,
                                int split_flag,
                                int flags);
 
 static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
-                            struct ext4_ext_path *path,
+                            struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags);
@@ -144,6 +141,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
 {
        if (path->p_bh) {
                /* path points to block */
+               BUFFER_TRACE(path->p_bh, "get_write_access");
                return ext4_journal_get_write_access(handle, path->p_bh);
        }
        /* path points to leaf/index in inode body */
@@ -161,6 +159,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
                     struct inode *inode, struct ext4_ext_path *path)
 {
        int err;
+
+       WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
        if (path->p_bh) {
                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
                /* path points to block */
@@ -289,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
        return size;
 }
 
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+                          struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+                          int nofail)
+{
+       struct ext4_ext_path *path = *ppath;
+       int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+       return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+                       EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+                       EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
+                       (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
+}
+
 /*
  * Calculate the number of metadata blocks needed
  * to allocate @blocks
@@ -424,7 +438,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
 
 static int __ext4_ext_check(const char *function, unsigned int line,
                            struct inode *inode, struct ext4_extent_header *eh,
-                           int depth)
+                           int depth, ext4_fsblk_t pblk)
 {
        const char *error_msg;
        int max = 0;
@@ -464,42 +478,149 @@ static int __ext4_ext_check(const char *function, unsigned int line,
 
 corrupted:
        ext4_error_inode(inode, function, line, 0,
-                       "bad header/extent: %s - magic %x, "
-                       "entries %u, max %u(%u), depth %u(%u)",
-                       error_msg, le16_to_cpu(eh->eh_magic),
-                       le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
-                       max, le16_to_cpu(eh->eh_depth), depth);
-
+                        "pblk %llu bad header/extent: %s - magic %x, "
+                        "entries %u, max %u(%u), depth %u(%u)",
+                        (unsigned long long) pblk, error_msg,
+                        le16_to_cpu(eh->eh_magic),
+                        le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+                        max, le16_to_cpu(eh->eh_depth), depth);
        return -EIO;
 }
 
-#define ext4_ext_check(inode, eh, depth)       \
-       __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk)                 \
+       __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
 
 int ext4_ext_check_inode(struct inode *inode)
 {
-       return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+       return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
 }
 
-static int __ext4_ext_check_block(const char *function, unsigned int line,
-                                 struct inode *inode,
-                                 struct ext4_extent_header *eh,
-                                 int depth,
-                                 struct buffer_head *bh)
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+                        struct inode *inode, ext4_fsblk_t pblk, int depth,
+                        int flags)
 {
-       int ret;
+       struct buffer_head              *bh;
+       int                             err;
 
-       if (buffer_verified(bh))
-               return 0;
-       ret = ext4_ext_check(inode, eh, depth);
-       if (ret)
-               return ret;
+       bh = sb_getblk(inode->i_sb, pblk);
+       if (unlikely(!bh))
+               return ERR_PTR(-ENOMEM);
+
+       if (!bh_uptodate_or_lock(bh)) {
+               trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+               err = bh_submit_read(bh);
+               if (err < 0)
+                       goto errout;
+       }
+       if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+               return bh;
+       err = __ext4_ext_check(function, line, inode,
+                              ext_block_hdr(bh), depth, pblk);
+       if (err)
+               goto errout;
        set_buffer_verified(bh);
-       return ret;
+       /*
+        * If this is a leaf block, cache all of its entries
+        */
+       if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+               struct ext4_extent_header *eh = ext_block_hdr(bh);
+               struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+               ext4_lblk_t prev = 0;
+               int i;
+
+               for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+                       unsigned int status = EXTENT_STATUS_WRITTEN;
+                       ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+                       int len = ext4_ext_get_actual_len(ex);
+
+                       if (prev && (prev != lblk))
+                               ext4_es_cache_extent(inode, prev,
+                                                    lblk - prev, ~0,
+                                                    EXTENT_STATUS_HOLE);
+
+                       if (ext4_ext_is_unwritten(ex))
+                               status = EXTENT_STATUS_UNWRITTEN;
+                       ext4_es_cache_extent(inode, lblk, len,
+                                            ext4_ext_pblock(ex), status);
+                       prev = lblk + len;
+               }
+       }
+       return bh;
+errout:
+       put_bh(bh);
+       return ERR_PTR(err);
+
 }
 
-#define ext4_ext_check_block(inode, eh, depth, bh)     \
-       __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+#define read_extent_tree_block(inode, pblk, depth, flags)              \
+       __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
+                                (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_ext_path *path = NULL;
+       struct buffer_head *bh;
+       int i = 0, depth, ret = 0;
+
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               return 0;       /* not an extent-mapped inode */
+
+       down_read(&ei->i_data_sem);
+       depth = ext_depth(inode);
+
+       path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+                      GFP_NOFS);
+       if (path == NULL) {
+               up_read(&ei->i_data_sem);
+               return -ENOMEM;
+       }
+
+       /* Don't cache anything if there are no external extent blocks */
+       if (depth == 0)
+               goto out;
+       path[0].p_hdr = ext_inode_hdr(inode);
+       ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+       if (ret)
+               goto out;
+       path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+       while (i >= 0) {
+               /*
+                * If this is a leaf block or we've reached the end of
+                * the index block, go up
+                */
+               if ((i == depth) ||
+                   path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+                       brelse(path[i].p_bh);
+                       path[i].p_bh = NULL;
+                       i--;
+                       continue;
+               }
+               bh = read_extent_tree_block(inode,
+                                           ext4_idx_pblock(path[i].p_idx++),
+                                           depth - i - 1,
+                                           EXT4_EX_FORCE_CACHE);
+               if (IS_ERR(bh)) {
+                       ret = PTR_ERR(bh);
+                       break;
+               }
+               i++;
+               path[i].p_bh = bh;
+               path[i].p_hdr = ext_block_hdr(bh);
+               path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+       }
+       ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+       up_read(&ei->i_data_sem);
+       ext4_ext_drop_refs(path);
+       kfree(path);
+       return ret;
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -514,7 +635,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
-                                 ext4_ext_is_uninitialized(path->p_ext),
+                                 ext4_ext_is_unwritten(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
                                  ext4_ext_pblock(path->p_ext));
                } else
@@ -540,7 +661,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 
        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
-                         ext4_ext_is_uninitialized(ex),
+                         ext4_ext_is_unwritten(ex),
                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
@@ -571,7 +692,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(ex->ee_block),
                                ext4_ext_pblock(ex),
-                               ext4_ext_is_uninitialized(ex),
+                               ext4_ext_is_unwritten(ex),
                                ext4_ext_get_actual_len(ex),
                                newblock);
                ex++;
@@ -586,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
 {
-       int depth = path->p_depth;
-       int i;
+       int depth, i;
 
+       if (!path)
+               return;
+       depth = path->p_depth;
        for (i = 0; i <= depth; i++, path++)
                if (path->p_bh) {
                        brelse(path->p_bh);
@@ -696,7 +819,7 @@ ext4_ext_binsearch(struct inode *inode,
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
                        ext4_ext_pblock(path->p_ext),
-                       ext4_ext_is_uninitialized(path->p_ext),
+                       ext4_ext_is_unwritten(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
@@ -732,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 }
 
 struct ext4_ext_path *
-ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
-                                       struct ext4_ext_path *path)
+ext4_find_extent(struct inode *inode, ext4_lblk_t block,
+                struct ext4_ext_path **orig_path, int flags)
 {
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
-       short int depth, i, ppos = 0, alloc = 0;
+       struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
+       short int depth, i, ppos = 0;
        int ret;
 
        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);
 
-       /* account possible depth increase */
+       if (path) {
+               ext4_ext_drop_refs(path);
+               if (depth > path[0].p_maxdepth) {
+                       kfree(path);
+                       *orig_path = path = NULL;
+               }
+       }
        if (!path) {
+               /* account possible depth increase */
                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
                                GFP_NOFS);
-               if (!path)
+               if (unlikely(!path))
                        return ERR_PTR(-ENOMEM);
-               alloc = 1;
+               path[0].p_maxdepth = depth + 1;
        }
        path[0].p_hdr = eh;
        path[0].p_bh = NULL;
@@ -765,20 +896,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
 
-               bh = sb_getblk(inode->i_sb, path[ppos].p_block);
-               if (unlikely(!bh)) {
-                       ret = -ENOMEM;
+               bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+                                           flags);
+               if (unlikely(IS_ERR(bh))) {
+                       ret = PTR_ERR(bh);
                        goto err;
                }
-               if (!bh_uptodate_or_lock(bh)) {
-                       trace_ext4_ext_load_extent(inode, block,
-                                               path[ppos].p_block);
-                       ret = bh_submit_read(bh);
-                       if (ret < 0) {
-                               put_bh(bh);
-                               goto err;
-                       }
-               }
+
                eh = ext_block_hdr(bh);
                ppos++;
                if (unlikely(ppos > depth)) {
@@ -790,11 +914,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                }
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
-               i--;
-
-               ret = ext4_ext_check_block(inode, eh, i, bh);
-               if (ret < 0)
-                       goto err;
        }
 
        path[ppos].p_depth = i;
@@ -813,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
 err:
        ext4_ext_drop_refs(path);
-       if (alloc)
-               kfree(path);
+       kfree(path);
+       if (orig_path)
+               *orig_path = NULL;
        return ERR_PTR(ret);
 }
 
@@ -1141,16 +1261,24 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-                                unsigned int flags,
-                                struct ext4_extent *newext)
+                                unsigned int flags)
 {
        struct ext4_extent_header *neh;
        struct buffer_head *bh;
-       ext4_fsblk_t newblock;
+       ext4_fsblk_t newblock, goal = 0;
+       struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        int err = 0;
 
-       newblock = ext4_ext_new_meta_block(handle, inode, NULL,
-               newext, &err, flags);
+       /* Try to prepend new index to old one */
+       if (ext_depth(inode))
+               goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
+       if (goal > le32_to_cpu(es->s_first_data_block)) {
+               flags |= EXT4_MB_HINT_TRY_GOAL;
+               goal--;
+       } else
+               goal = ext4_inode_to_goal_block(inode);
+       newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+                                       NULL, &err);
        if (newblock == 0)
                return err;
 
@@ -1215,10 +1343,12 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-                                   unsigned int flags,
-                                   struct ext4_ext_path *path,
+                                   unsigned int mb_flags,
+                                   unsigned int gb_flags,
+                                   struct ext4_ext_path **ppath,
                                    struct ext4_extent *newext)
 {
+       struct ext4_ext_path *path = *ppath;
        struct ext4_ext_path *curp;
        int depth, i, err = 0;
 
@@ -1237,28 +1367,26 @@ repeat:
        if (EXT_HAS_FREE_INDEX(curp)) {
                /* if we found index with free entry, then use that
                 * entry: create all needed subtree and add new leaf */
-               err = ext4_ext_split(handle, inode, flags, path, newext, i);
+               err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
                if (err)
                        goto out;
 
                /* refill path */
-               ext4_ext_drop_refs(path);
-               path = ext4_ext_find_extent(inode,
+               path = ext4_find_extent(inode,
                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-                                   path);
+                                   ppath, gb_flags);
                if (IS_ERR(path))
                        err = PTR_ERR(path);
        } else {
                /* tree is full, time to grow in depth */
-               err = ext4_ext_grow_indepth(handle, inode, flags, newext);
+               err = ext4_ext_grow_indepth(handle, inode, mb_flags);
                if (err)
                        goto out;
 
                /* refill path */
-               ext4_ext_drop_refs(path);
-               path = ext4_ext_find_extent(inode,
+               path = ext4_find_extent(inode,
                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
-                                   path);
+                                   ppath, gb_flags);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        goto out;
@@ -1429,29 +1557,21 @@ got_index:
        ix++;
        block = ext4_idx_pblock(ix);
        while (++depth < path->p_depth) {
-               bh = sb_bread(inode->i_sb, block);
-               if (bh == NULL)
-                       return -EIO;
-               eh = ext_block_hdr(bh);
                /* subtract from p_depth to get proper eh_depth */
-               if (ext4_ext_check_block(inode, eh,
-                                        path->p_depth - depth, bh)) {
-                       put_bh(bh);
-                       return -EIO;
-               }
+               bh = read_extent_tree_block(inode, block,
+                                           path->p_depth - depth, 0);
+               if (IS_ERR(bh))
+                       return PTR_ERR(bh);
+               eh = ext_block_hdr(bh);
                ix = EXT_FIRST_INDEX(eh);
                block = ext4_idx_pblock(ix);
                put_bh(bh);
        }
 
-       bh = sb_bread(inode->i_sb, block);
-       if (bh == NULL)
-               return -EIO;
+       bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+       if (IS_ERR(bh))
+               return PTR_ERR(bh);
        eh = ext_block_hdr(bh);
-       if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
-               put_bh(bh);
-               return -EIO;
-       }
        ex = EXT_FIRST_EXTENT(eh);
 found_extent:
        *logical = le32_to_cpu(ex->ee_block);
@@ -1469,7 +1589,7 @@ found_extent:
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static ext4_lblk_t
+ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
        int depth;
@@ -1595,22 +1715,17 @@ int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
                                struct ext4_extent *ex2)
 {
-       unsigned short ext1_ee_len, ext2_ee_len, max_len;
+       unsigned short ext1_ee_len, ext2_ee_len;
 
        /*
         * Make sure that both extents are initialized. We don't merge
-        * uninitialized extents so that we can be sure that end_io code has
+        * unwritten extents so that we can be sure that end_io code has
         * the extent that was written properly split out and conversion to
         * initialized is trivial.
         */
-       if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+       if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
                return 0;
 
-       if (ext4_ext_is_uninitialized(ex1))
-               max_len = EXT_UNINIT_MAX_LEN;
-       else
-               max_len = EXT_INIT_MAX_LEN;
-
        ext1_ee_len = ext4_ext_get_actual_len(ex1);
        ext2_ee_len = ext4_ext_get_actual_len(ex2);
 
@@ -1623,7 +1738,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         * as an RO_COMPAT feature, refuse to merge to extents if
         * this can result in the top bit of ee_len being set.
         */
-       if (ext1_ee_len + ext2_ee_len > max_len)
+       if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+               return 0;
+       if (ext4_ext_is_unwritten(ex1) &&
+           (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+            atomic_read(&EXT4_I(inode)->i_unwritten) ||
+            (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
                return 0;
 #ifdef AGGRESSIVE_TEST
        if (ext1_ee_len >= 4)
@@ -1648,8 +1768,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
-       int merge_done = 0;
-       int uninitialized = 0;
+       int merge_done = 0, unwritten;
 
        depth = ext_depth(inode);
        BUG_ON(path[depth].p_hdr == NULL);
@@ -1659,12 +1778,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
                        break;
                /* merge with next extent! */
-               if (ext4_ext_is_uninitialized(ex))
-                       uninitialized = 1;
+               unwritten = ext4_ext_is_unwritten(ex);
                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                + ext4_ext_get_actual_len(ex + 1));
-               if (uninitialized)
-                       ext4_ext_mark_uninitialized(ex);
+               if (unwritten)
+                       ext4_ext_mark_unwritten(ex);
 
                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1714,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
                sizeof(struct ext4_extent_idx);
        s += sizeof(struct ext4_extent_header);
 
+       path[1].p_maxdepth = path[0].p_maxdepth;
        memcpy(path[0].p_hdr, path[1].p_hdr, s);
        path[0].p_depth = 0;
        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
@@ -1722,8 +1841,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
 
        brelse(path[1].p_bh);
        ext4_free_blocks(handle, inode, NULL, blk, 1,
-                        EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
-                        EXT4_FREE_BLOCKS_RESERVE);
+                        EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 }
 
 /*
@@ -1809,18 +1927,20 @@ out:
  * creating new leaf in the no-space case.
  */
 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *path,
-                               struct ext4_extent *newext, int flag)
+                               struct ext4_ext_path **ppath,
+                               struct ext4_extent *newext, int gb_flags)
 {
+       struct ext4_ext_path *path = *ppath;
        struct ext4_extent_header *eh;
        struct ext4_extent *ex, *fex;
        struct ext4_extent *nearex; /* nearest extent */
        struct ext4_ext_path *npath = NULL;
        int depth, len, err;
        ext4_lblk_t next;
-       unsigned uninitialized = 0;
-       int flags = 0;
+       int mb_flags = 0, unwritten;
 
+       if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               mb_flags |= EXT4_MB_DELALLOC_RESERVED;
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
                return -EIO;
@@ -1834,12 +1954,12 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        }
 
        /* try to insert block into found extent and return */
-       if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
+       if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
 
                /*
                 * Try to see whether we should rather test the extent on
                 * right from ex, or from the left of ex. This is because
-                * ext4_ext_find_extent() can return either extent on the
+                * ext4_find_extent() can return either extent on the
                 * left, or on the right from the searched position. This
                 * will make merging more effective.
                 */
@@ -1859,29 +1979,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                if (ext4_can_extents_be_merged(inode, ex, newext)) {
                        ext_debug("append [%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
-                                 ext4_ext_is_uninitialized(newext),
+                                 ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
-                                 ext4_ext_is_uninitialized(ex),
+                                 ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
                                                  path + depth);
                        if (err)
                                return err;
-
-                       /*
-                        * ext4_can_extents_be_merged should have checked
-                        * that either both extents are uninitialized, or
-                        * both aren't. Thus we need to check only one of
-                        * them here.
-                        */
-                       if (ext4_ext_is_uninitialized(ex))
-                               uninitialized = 1;
+                       unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
-                       if (uninitialized)
-                               ext4_ext_mark_uninitialized(ex);
+                       if (unwritten)
+                               ext4_ext_mark_unwritten(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -1893,10 +2005,10 @@ prepend:
                        ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
                                  le32_to_cpu(newext->ee_block),
-                                 ext4_ext_is_uninitialized(newext),
+                                 ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
-                                 ext4_ext_is_uninitialized(ex),
+                                 ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
@@ -1904,20 +2016,13 @@ prepend:
                        if (err)
                                return err;
 
-                       /*
-                        * ext4_can_extents_be_merged should have checked
-                        * that either both extents are uninitialized, or
-                        * both aren't. Thus we need to check only one of
-                        * them here.
-                        */
-                       if (ext4_ext_is_uninitialized(ex))
-                               uninitialized = 1;
+                       unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_block = newext->ee_block;
                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
-                       if (uninitialized)
-                               ext4_ext_mark_uninitialized(ex);
+                       if (unwritten)
+                               ext4_ext_mark_unwritten(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -1937,7 +2042,7 @@ prepend:
        if (next != EXT_MAX_BLOCKS) {
                ext_debug("next leaf block - %u\n", next);
                BUG_ON(npath != NULL);
-               npath = ext4_ext_find_extent(inode, next, NULL);
+               npath = ext4_find_extent(inode, next, NULL, 0);
                if (IS_ERR(npath))
                        return PTR_ERR(npath);
                BUG_ON(npath->p_depth != path->p_depth);
@@ -1956,9 +2061,10 @@ prepend:
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
-       if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
-               flags = EXT4_MB_USE_RESERVED;
-       err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
+       if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+               mb_flags |= EXT4_MB_USE_RESERVED;
+       err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+                                      ppath, newext);
        if (err)
                goto cleanup;
        depth = ext_depth(inode);
@@ -1976,7 +2082,7 @@ has_space:
                ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
                                ext4_ext_pblock(newext),
-                               ext4_ext_is_uninitialized(newext),
+                               ext4_ext_is_unwritten(newext),
                                ext4_ext_get_actual_len(newext));
                nearex = EXT_FIRST_EXTENT(eh);
        } else {
@@ -1987,7 +2093,7 @@ has_space:
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
-                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                        nearex++;
@@ -1998,7 +2104,7 @@ has_space:
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
-                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                }
@@ -2008,7 +2114,7 @@ has_space:
                                        "move %d extents from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
-                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        len, nearex, nearex + 1);
                        memmove(nearex + 1, nearex,
@@ -2024,7 +2130,7 @@ has_space:
 
 merge:
        /* try to merge extents */
-       if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
+       if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
                ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 
@@ -2036,10 +2142,8 @@ merge:
        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 
 cleanup:
-       if (npath) {
-               ext4_ext_drop_refs(npath);
-               kfree(npath);
-       }
+       ext4_ext_drop_refs(npath);
+       kfree(npath);
        return err;
 }
 
@@ -2061,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
                /* find extent for this block */
                down_read(&EXT4_I(inode)->i_data_sem);
 
-               if (path && ext_depth(inode) != depth) {
-                       /* depth was changed. we have to realloc path */
-                       kfree(path);
-                       path = NULL;
-               }
-
-               path = ext4_ext_find_extent(inode, block, path);
+               path = ext4_find_extent(inode, block, &path, 0);
                if (IS_ERR(path)) {
                        up_read(&EXT4_I(inode)->i_data_sem);
                        err = PTR_ERR(path);
@@ -2084,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
                }
                ex = path[depth].p_ext;
                next = ext4_ext_next_allocated_block(path);
-               ext4_ext_drop_refs(path);
 
                flags = 0;
                exists = 0;
@@ -2130,7 +2227,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
                        es.es_lblk = le32_to_cpu(ex->ee_block);
                        es.es_len = ext4_ext_get_actual_len(ex);
                        es.es_pblk = ext4_ext_pblock(ex);
-                       if (ext4_ext_is_uninitialized(ex))
+                       if (ext4_ext_is_unwritten(ex))
                                flags |= FIEMAP_EXTENT_UNWRITTEN;
                }
 
@@ -2142,7 +2239,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
                next_del = ext4_find_delayed_extent(inode, &es);
                if (!exists && next_del) {
                        exists = 1;
-                       flags |= FIEMAP_EXTENT_DELALLOC;
+                       flags |= (FIEMAP_EXTENT_DELALLOC |
+                                 FIEMAP_EXTENT_UNKNOWN);
                }
                up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2193,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
                block = es.es_lblk + es.es_len;
        }
 
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
-
+       ext4_ext_drop_refs(path);
+       kfree(path);
        return err;
 }
 
@@ -2344,17 +2439,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
        int index;
        int depth;
@@ -2365,7 +2458,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
        depth = ext_depth(inode);
 
-       if (chunk)
+       if (extents <= 1)
                index = depth * 2;
        else
                index = depth * 3;
@@ -2373,20 +2466,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
        return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+               return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+       else if (ext4_should_journal_data(inode))
+               return EXT4_FREE_BLOCKS_FORGET;
+       return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                              struct ext4_extent *ex,
-                             ext4_fsblk_t *partial_cluster,
+                             long long *partial_cluster,
                              ext4_lblk_t from, ext4_lblk_t to)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
        ext4_fsblk_t pblk;
-       int flags = 0;
-
-       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-               flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-       else if (ext4_should_journal_data(inode))
-               flags |= EXT4_FREE_BLOCKS_FORGET;
+       int flags = get_default_free_blocks_flags(inode);
 
        /*
         * For bigalloc file systems, we never free a partial cluster
@@ -2404,7 +2501,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
         * partial cluster here.
         */
        pblk = ext4_ext_pblock(ex) + ee_len - 1;
-       if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+       if ((*partial_cluster > 0) &&
+           (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_C2B(sbi, *partial_cluster),
                                 sbi->s_cluster_ratio, flags);
@@ -2430,41 +2528,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
            && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
                /* tail removal */
                ext4_lblk_t num;
+               unsigned int unaligned;
 
                num = le32_to_cpu(ex->ee_block) + ee_len - from;
                pblk = ext4_ext_pblock(ex) + ee_len - num;
-               ext_debug("free last %u blocks starting %llu\n", num, pblk);
+               /*
+                * Usually we want to free partial cluster at the end of the
+                * extent, except for the situation when the cluster is still
+                * used by any other extent (partial_cluster is negative).
+                */
+               if (*partial_cluster < 0 &&
+                   -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+                       flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+               ext_debug("free last %u blocks starting %llu partial %lld\n",
+                         num, pblk, *partial_cluster);
                ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
                /*
                 * If the block range to be freed didn't start at the
                 * beginning of a cluster, and we removed the entire
-                * extent, save the partial cluster here, since we
-                * might need to delete if we determine that the
-                * truncate operation has removed all of the blocks in
-                * the cluster.
+                * extent and the cluster is not used by any other extent,
+                * save the partial cluster here, since we might need to
+                * delete if we determine that the truncate operation has
+                * removed all of the blocks in the cluster.
+                *
+                * On the other hand, if we did not manage to free the whole
+                * extent, we have to mark the cluster as used (store negative
+                * cluster number in partial_cluster).
                 */
-               if (EXT4_PBLK_COFF(sbi, pblk) &&
-                   (ee_len == num))
+               unaligned = EXT4_PBLK_COFF(sbi, pblk);
+               if (unaligned && (ee_len == num) &&
+                   (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
                        *partial_cluster = EXT4_B2C(sbi, pblk);
-               else
+               else if (unaligned)
+                       *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+               else if (*partial_cluster > 0)
                        *partial_cluster = 0;
-       } else if (from == le32_to_cpu(ex->ee_block)
-                  && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-               /* head removal */
-               ext4_lblk_t num;
-               ext4_fsblk_t start;
-
-               num = to - from;
-               start = ext4_ext_pblock(ex);
-
-               ext_debug("free first %u blocks starting %llu\n", num, start);
-               ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-       } else {
-               printk(KERN_INFO "strange request: removal(2) "
-                               "%u-%u from %u:%u\n",
-                               from, to, le32_to_cpu(ex->ee_block), ee_len);
-       }
+       } else
+               ext4_error(sbi->s_sb, "strange request: removal(2) "
+                          "%u-%u from %u:%u\n",
+                          from, to, le32_to_cpu(ex->ee_block), ee_len);
        return 0;
 }
 
@@ -2477,12 +2580,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:   The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-                struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+                struct ext4_ext_path *path,
+                long long *partial_cluster,
                 ext4_lblk_t start, ext4_lblk_t end)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2493,8 +2600,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        unsigned num;
        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
-       unsigned uninitialized = 0;
+       unsigned unwritten = 0;
        struct ext4_extent *ex;
+       ext4_fsblk_t pblk;
 
        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2506,7 +2614,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                return -EIO;
        }
        /* find where to start removing */
-       ex = EXT_LAST_EXTENT(eh);
+       ex = path[depth].p_ext;
+       if (!ex)
+               ex = EXT_LAST_EXTENT(eh);
 
        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2537,13 +2647,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {
 
-               if (ext4_ext_is_uninitialized(ex))
-                       uninitialized = 1;
+               if (ext4_ext_is_unwritten(ex))
+                       unwritten = 1;
                else
-                       uninitialized = 0;
+                       unwritten = 0;
 
                ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
-                        uninitialized, ex_ee_len);
+                         unwritten, ex_ee_len);
                path[depth].p_ext = ex;
 
                a = ex_ee_block > start ? ex_ee_block : start;
@@ -2554,6 +2664,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
                /* If this extent is beyond the end of the hole, skip it */
                if (end < ex_ee_block) {
+                       /*
+                        * We're going to skip this extent and move to another,
+                        * so if this extent is not cluster aligned we have
+                        * to mark the current cluster as used to avoid
+                        * accidentally freeing it later on
+                        */
+                       pblk = ext4_ext_pblock(ex);
+                       if (EXT4_PBLK_COFF(sbi, pblk))
+                               *partial_cluster =
+                                       -((long long)EXT4_B2C(sbi, pblk));
                        ex--;
                        ex_ee_block = le32_to_cpu(ex->ee_block);
                        ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2605,11 +2725,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
                ex->ee_len = cpu_to_le16(num);
                /*
-                * Do not mark uninitialized if all the blocks in the
+                * Do not mark unwritten if all the blocks in the
                 * extent have been removed.
                 */
-               if (uninitialized && num)
-                       ext4_ext_mark_uninitialized(ex);
+               if (unwritten && num)
+                       ext4_ext_mark_unwritten(ex);
                /*
                 * If the extent was completely released,
                 * we need to remove it from the leaf
@@ -2629,7 +2749,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                                        sizeof(struct ext4_extent));
                        }
                        le16_add_cpu(&eh->eh_entries, -1);
-               } else
+               } else if (*partial_cluster > 0)
                        *partial_cluster = 0;
 
                err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2647,17 +2767,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                err = ext4_ext_correct_indexes(handle, inode, path);
 
        /*
-        * If there is still a entry in the leaf node, check to see if
-        * it references the partial cluster.  This is the only place
-        * where it could; if it doesn't, we can free the cluster.
+        * If there's a partial cluster and at least one extent remains in
+        * the leaf, free the partial cluster if it isn't shared with the
+        * current extent.  If there's a partial cluster and no extents
+        * remain in the leaf, it can't be freed here.  It can only be
+        * freed when it's possible to determine if it's not shared with
+        * any other extent - when the next leaf is processed or when space
+        * removal is complete.
         */
-       if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+       if (*partial_cluster > 0 && eh->eh_entries &&
            (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
             *partial_cluster)) {
-               int flags = EXT4_FREE_BLOCKS_FORGET;
-
-               if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                       flags |= EXT4_FREE_BLOCKS_METADATA;
+               int flags = get_default_free_blocks_flags(inode);
 
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_C2B(sbi, *partial_cluster),
@@ -2701,7 +2822,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
        struct super_block *sb = inode->i_sb;
        int depth = ext_depth(inode);
        struct ext4_ext_path *path = NULL;
-       ext4_fsblk_t partial_cluster = 0;
+       long long partial_cluster = 0;
        handle_t *handle;
        int i = 0, err = 0;
 
@@ -2713,7 +2834,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                return PTR_ERR(handle);
 
 again:
-       trace_ext4_ext_remove_space(inode, start, depth);
+       trace_ext4_ext_remove_space(inode, start, end, depth);
 
        /*
         * Check if we are removing extents inside the extent tree. If that
@@ -2727,7 +2848,7 @@ again:
                ext4_lblk_t ee_block;
 
                /* find extent for this block */
-               path = ext4_ext_find_extent(inode, end, NULL);
+               path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path)) {
                        ext4_journal_stop(handle);
                        return PTR_ERR(path);
@@ -2755,23 +2876,14 @@ again:
                 */
                if (end >= ee_block &&
                    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
-                       int split_flag = 0;
-
-                       if (ext4_ext_is_uninitialized(ex))
-                               split_flag = EXT4_EXT_MARK_UNINIT1 |
-                                            EXT4_EXT_MARK_UNINIT2;
-
                        /*
                         * Split the extent in two so that 'end' is the last
                         * block in the first new extent. Also we should not
                         * fail removing space due to ENOSPC so try to use
                         * reserved block if that happens.
                         */
-                       err = ext4_split_extent_at(handle, inode, path,
-                                       end + 1, split_flag,
-                                       EXT4_GET_BLOCKS_PRE_IO |
-                                       EXT4_GET_BLOCKS_METADATA_NOFAIL);
-
+                       err = ext4_force_split_extent_at(handle, inode, &path,
+                                                        end + 1, 1);
                        if (err < 0)
                                goto out;
                }
@@ -2793,11 +2905,11 @@ again:
                        ext4_journal_stop(handle);
                        return -ENOMEM;
                }
-               path[0].p_depth = depth;
+               path[0].p_maxdepth = path[0].p_depth = depth;
                path[0].p_hdr = ext_inode_hdr(inode);
                i = 0;
 
-               if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+               if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
                        err = -EIO;
                        goto out;
                }
@@ -2844,21 +2956,21 @@ again:
                        ext_debug("move to level %d (block %llu)\n",
                                  i + 1, ext4_idx_pblock(path[i].p_idx));
                        memset(path + i + 1, 0, sizeof(*path));
-                       bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
-                       if (!bh) {
+                       bh = read_extent_tree_block(inode,
+                               ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+                               EXT4_EX_NOCACHE);
+                       if (IS_ERR(bh)) {
                                /* should we reset i_size? */
-                               err = -EIO;
+                               err = PTR_ERR(bh);
                                break;
                        }
+                       /* Yield here to deal with large extent trees.
+                        * Should be a no-op if we did IO above. */
+                       cond_resched();
                        if (WARN_ON(i + 1 > depth)) {
                                err = -EIO;
                                break;
                        }
-                       if (ext4_ext_check_block(inode, ext_block_hdr(bh),
-                                                       depth - i - 1, bh)) {
-                               err = -EIO;
-                               break;
-                       }
                        path[i + 1].p_bh = bh;
 
                        /* save actual number of indexes since this
@@ -2881,17 +2993,14 @@ again:
                }
        }
 
-       trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-                       path->p_hdr->eh_entries);
+       trace_ext4_ext_remove_space_done(inode, start, end, depth,
+                       partial_cluster, path->p_hdr->eh_entries);
 
        /* If we still have something in the partial cluster and we have removed
         * even the first extent, then we should free the blocks in the partial
         * cluster as well. */
-       if (partial_cluster && path->p_hdr->eh_entries == 0) {
-               int flags = EXT4_FREE_BLOCKS_FORGET;
-
-               if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                       flags |= EXT4_FREE_BLOCKS_METADATA;
+       if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+               int flags = get_default_free_blocks_flags(inode);
 
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -2916,10 +3025,9 @@ again:
 out:
        ext4_ext_drop_refs(path);
        kfree(path);
-       if (err == -EAGAIN) {
-               path = NULL;
+       path = NULL;
+       if (err == -EAGAIN)
                goto again;
-       }
        ext4_journal_stop(handle);
 
        return err;
@@ -2976,6 +3084,23 @@ void ext4_ext_release(struct super_block *sb)
 #endif
 }
 
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+       ext4_lblk_t  ee_block;
+       ext4_fsblk_t ee_pblock;
+       unsigned int ee_len;
+
+       ee_block  = le32_to_cpu(ex->ee_block);
+       ee_len    = ext4_ext_get_actual_len(ex);
+       ee_pblock = ext4_ext_pblock(ex);
+
+       if (ee_len == 0)
+               return 0;
+
+       return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+                                    EXTENT_STATUS_WRITTEN);
+}
+
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
@@ -3001,7 +3126,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  * @path: the path to the extent
  * @split: the logical block where the extent is splitted.
  * @split_flags: indicates if the extent could be zeroout if split fails, and
- *              the states(init or uninit) of new extents.
+ *              the states(init or unwritten) of new extents.
  * @flags: flags used to insert new extent to extent tree.
  *
  *
@@ -3016,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  */
 static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
-                            struct ext4_ext_path *path,
+                            struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags)
 {
+       struct ext4_ext_path *path = *ppath;
        ext4_fsblk_t newblock;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3043,10 +3169,10 @@ static int ext4_split_extent_at(handle_t *handle,
        newblock = split - ee_block + ext4_ext_pblock(ex);
 
        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
-       BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+       BUG_ON(!ext4_ext_is_unwritten(ex) &&
               split_flag & (EXT4_EXT_MAY_ZEROOUT |
-                            EXT4_EXT_MARK_UNINIT1 |
-                            EXT4_EXT_MARK_UNINIT2));
+                            EXT4_EXT_MARK_UNWRIT1 |
+                            EXT4_EXT_MARK_UNWRIT2));
 
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
@@ -3058,8 +3184,8 @@ static int ext4_split_extent_at(handle_t *handle,
                 * then we just change the state of the extent, and splitting
                 * is not needed.
                 */
-               if (split_flag & EXT4_EXT_MARK_UNINIT2)
-                       ext4_ext_mark_uninitialized(ex);
+               if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+                       ext4_ext_mark_unwritten(ex);
                else
                        ext4_ext_mark_initialized(ex);
 
@@ -3073,8 +3199,8 @@ static int ext4_split_extent_at(handle_t *handle,
        /* case a */
        memcpy(&orig_ex, ex, sizeof(orig_ex));
        ex->ee_len = cpu_to_le16(split - ee_block);
-       if (split_flag & EXT4_EXT_MARK_UNINIT1)
-               ext4_ext_mark_uninitialized(ex);
+       if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+               ext4_ext_mark_unwritten(ex);
 
        /*
         * path may lead to new leaf, not to original leaf any more
@@ -3088,10 +3214,10 @@ static int ext4_split_extent_at(handle_t *handle,
        ex2->ee_block = cpu_to_le32(split);
        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
        ext4_ext_store_pblock(ex2, newblock);
-       if (split_flag & EXT4_EXT_MARK_UNINIT2)
-               ext4_ext_mark_uninitialized(ex2);
+       if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+               ext4_ext_mark_unwritten(ex2);
 
-       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+       err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
                        if (split_flag & EXT4_EXT_DATA_VALID1) {
@@ -3128,7 +3254,7 @@ static int ext4_split_extent_at(handle_t *handle,
                        goto fix_extent_len;
 
                /* update extent status tree */
-               err = ext4_es_zeroout(inode, &zero_ex);
+               err = ext4_zeroout_es(inode, &zero_ex);
 
                goto out;
        } else if (err)
@@ -3140,7 +3266,7 @@ out:
 
 fix_extent_len:
        ex->ee_len = orig_ex.ee_len;
-       ext4_ext_dirty(handle, inode, path + depth);
+       ext4_ext_dirty(handle, inode, path + path->p_depth);
        return err;
 }
 
@@ -3148,7 +3274,7 @@ fix_extent_len:
  * ext4_split_extents() splits an extent and mark extent which is covered
  * by @map as split_flags indicates
  *
- * It may result in splitting the extent into multiple extents (upto three)
+ * It may result in splitting the extent into multiple extents (up to three)
  * There are three possibilities:
  *   a> There is no split required
  *   b> Splits in two extents: Split is happening at either end of the extent
@@ -3157,16 +3283,17 @@ fix_extent_len:
  */
 static int ext4_split_extent(handle_t *handle,
                              struct inode *inode,
-                             struct ext4_ext_path *path,
+                             struct ext4_ext_path **ppath,
                              struct ext4_map_blocks *map,
                              int split_flag,
                              int flags)
 {
+       struct ext4_ext_path *path = *ppath;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex;
        unsigned int ee_len, depth;
        int err = 0;
-       int uninitialized;
+       int unwritten;
        int split_flag1, flags1;
        int allocated = map->m_len;
 
@@ -3174,17 +3301,17 @@ static int ext4_split_extent(handle_t *handle,
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-       uninitialized = ext4_ext_is_uninitialized(ex);
+       unwritten = ext4_ext_is_unwritten(ex);
 
        if (map->m_lblk + map->m_len < ee_block + ee_len) {
                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
-               if (uninitialized)
-                       split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
-                                      EXT4_EXT_MARK_UNINIT2;
+               if (unwritten)
+                       split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+                                      EXT4_EXT_MARK_UNWRIT2;
                if (split_flag & EXT4_EXT_DATA_VALID2)
                        split_flag1 |= EXT4_EXT_DATA_VALID1;
-               err = ext4_split_extent_at(handle, inode, path,
+               err = ext4_split_extent_at(handle, inode, ppath,
                                map->m_lblk + map->m_len, split_flag1, flags1);
                if (err)
                        goto out;
@@ -3195,23 +3322,27 @@ static int ext4_split_extent(handle_t *handle,
         * Update path is required because previous ext4_split_extent_at() may
         * result in split of original leaf or extent zeroout.
         */
-       ext4_ext_drop_refs(path);
-       path = ext4_ext_find_extent(inode, map->m_lblk, path);
+       path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
        if (IS_ERR(path))
                return PTR_ERR(path);
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
-       uninitialized = ext4_ext_is_uninitialized(ex);
+       if (!ex) {
+               EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+                                (unsigned long) map->m_lblk);
+               return -EIO;
+       }
+       unwritten = ext4_ext_is_unwritten(ex);
        split_flag1 = 0;
 
        if (map->m_lblk >= ee_block) {
                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
-               if (uninitialized) {
-                       split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+               if (unwritten) {
+                       split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
-                                                    EXT4_EXT_MARK_UNINIT2);
+                                                    EXT4_EXT_MARK_UNWRIT2);
                }
-               err = ext4_split_extent_at(handle, inode, path,
+               err = ext4_split_extent_at(handle, inode, ppath,
                                map->m_lblk, split_flag1, flags);
                if (err)
                        goto out;
@@ -3224,16 +3355,16 @@ out:
 
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
+ * to an unwritten extent. It may result in splitting the unwritten
  * extent into multiple extents (up to three - one initialized and two
- * uninitialized).
+ * unwritten).
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
  * Pre-conditions:
- *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' is unwritten.
  *  - The extent pointed to by 'path' contains a superset
  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
  *
@@ -3245,9 +3376,10 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
-                                          struct ext4_ext_path *path,
+                                          struct ext4_ext_path **ppath,
                                           int flags)
 {
+       struct ext4_ext_path *path = *ppath;
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
@@ -3279,12 +3411,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
        /* Pre-conditions */
-       BUG_ON(!ext4_ext_is_uninitialized(ex));
+       BUG_ON(!ext4_ext_is_unwritten(ex));
        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
 
        /*
         * Attempt to transfer newly initialized blocks from the currently
-        * uninitialized extent to its neighbor. This is much cheaper
+        * unwritten extent to its neighbor. This is much cheaper
         * than an insertion followed by a merge as those involve costly
         * memmove() calls. Transferring to the left is the common case in
         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
@@ -3320,7 +3452,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
-               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
+               if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
@@ -3335,7 +3467,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ex->ee_block = cpu_to_le32(ee_block + map_len);
                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
-                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+                       ext4_ext_mark_unwritten(ex); /* Restore the flag */
 
                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
@@ -3366,7 +3498,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
-               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
+               if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                    ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
@@ -3381,7 +3513,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
-                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+                       ext4_ext_mark_unwritten(ex); /* Restore the flag */
 
                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);
@@ -3403,7 +3535,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
-        * zeroout only if extent is fully insde i_size or new_size.
+        * zeroout only if extent is fully inside i_size or new_size.
         */
        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
@@ -3471,55 +3603,57 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                }
        }
 
-       allocated = ext4_split_extent(handle, inode, path,
-                                     &split_map, split_flag, flags);
-       if (allocated < 0)
-               err = allocated;
-
+       err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+                               flags);
+       if (err > 0)
+               err = 0;
 out:
        /* If we have gotten a failure, don't zero out status tree */
        if (!err)
-               err = ext4_es_zeroout(inode, &zero_ex);
+               err = ext4_zeroout_es(inode, &zero_ex);
        return err ? err : allocated;
 }
 
 /*
  * This function is called by ext4_ext_map_blocks() from
  * ext4_get_blocks_dio_write() when DIO to write
- * to an uninitialized extent.
+ * to an unwritten extent.
  *
- * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple initialized/uninitialized extents (up to three)
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
  * There are three possibilities:
- *   a> There is no split required: Entire extent should be uninitialized
+ *   a> There is no split required: Entire extent should be unwritten
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
  * One of more index blocks maybe needed if the extent tree grow after
- * the uninitialized extent split. To prevent ENOSPC occur at the IO
- * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitialized extent called at this time will be split
- * into three uninitialized extent(at most). After IO complete, the part
+ * the unwritten extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the unwritten extent before DIO submit
+ * the IO. The unwritten extent called at this time will be split
+ * into three unwritten extent(at most). After IO complete, the part
  * being filled will be convert to initialized by the end_io callback function
  * via ext4_convert_unwritten_extents().
  *
- * Returns the size of uninitialized extent to be written on success.
+ * Returns the size of unwritten extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
-                                       struct ext4_ext_path *path,
+                                       struct ext4_ext_path **ppath,
                                        int flags)
 {
+       struct ext4_ext_path *path = *ppath;
        ext4_lblk_t eof_block;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex;
        unsigned int ee_len;
        int split_flag = 0, depth;
 
-       ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-               "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+       ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+                 __func__, inode->i_ino,
+                 (unsigned long long)map->m_lblk, map->m_len);
 
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
@@ -3534,19 +3668,25 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
 
-       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-       split_flag |= EXT4_EXT_MARK_UNINIT2;
-       if (flags & EXT4_GET_BLOCKS_CONVERT)
-               split_flag |= EXT4_EXT_DATA_VALID2;
+       /* Convert to unwritten */
+       if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+               split_flag |= EXT4_EXT_DATA_VALID1;
+       /* Convert to initialized */
+       } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+               split_flag |= ee_block + ee_len <= eof_block ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+       }
        flags |= EXT4_GET_BLOCKS_PRE_IO;
-       return ext4_split_extent(handle, inode, path, map, split_flag, flags);
+       return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
 }
 
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
-                                               struct ext4_ext_path *path)
+                                               struct ext4_ext_path **ppath)
 {
+       struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        unsigned int ee_len;
@@ -3575,16 +3715,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
-               err = ext4_split_unwritten_extents(handle, inode, map, path,
-                                                  EXT4_GET_BLOCKS_CONVERT);
+               err = ext4_split_convert_extents(handle, inode, map, ppath,
+                                                EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
-                       goto out;
-               ext4_ext_drop_refs(path);
-               path = ext4_ext_find_extent(inode, map->m_lblk, path);
-               if (IS_ERR(path)) {
-                       err = PTR_ERR(path);
-                       goto out;
-               }
+                       return err;
+               path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
                depth = ext_depth(inode);
                ex = path[depth].p_ext;
        }
@@ -3777,34 +3914,109 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 }
 
 static int
-ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+convert_initialized_extent(handle_t *handle, struct inode *inode,
+                          struct ext4_map_blocks *map,
+                          struct ext4_ext_path **ppath, int flags,
+                          unsigned int allocated, ext4_fsblk_t newblock)
+{
+       struct ext4_ext_path *path = *ppath;
+       struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       unsigned int ee_len;
+       int depth;
+       int err = 0;
+
+       /*
+        * Make sure that the extent is no bigger than we support with
+        * unwritten extent
+        */
+       if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+               map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+
+       ext_debug("%s: inode %lu, logical"
+               "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+                 (unsigned long long)ee_block, ee_len);
+
+       if (ee_block != map->m_lblk || ee_len > map->m_len) {
+               err = ext4_split_convert_extents(handle, inode, map, ppath,
+                               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+               if (err < 0)
+                       return err;
+               path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               depth = ext_depth(inode);
+               ex = path[depth].p_ext;
+               if (!ex) {
+                       EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+                                        (unsigned long) map->m_lblk);
+                       return -EIO;
+               }
+       }
+
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               return err;
+       /* first mark the extent as unwritten */
+       ext4_ext_mark_unwritten(ex);
+
+       /* note: ext4_ext_correct_indexes() isn't needed here because
+        * borders are not changed
+        */
+       ext4_ext_try_to_merge(handle, inode, path, ex);
+
+       /* Mark modified extent as dirty */
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+       if (err)
+               return err;
+       ext4_ext_show_leaf(inode, path);
+
+       ext4_update_inode_fsync_trans(handle, inode, 1);
+       err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
+       if (err)
+               return err;
+       map->m_flags |= EXT4_MAP_UNWRITTEN;
+       if (allocated > map->m_len)
+               allocated = map->m_len;
+       map->m_len = allocated;
+       return allocated;
+}
+
+static int
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
-                       struct ext4_ext_path *path, int flags,
+                       struct ext4_ext_path **ppath, int flags,
                        unsigned int allocated, ext4_fsblk_t newblock)
 {
+       struct ext4_ext_path *path = *ppath;
        int ret = 0;
        int err = 0;
        ext4_io_end_t *io = ext4_inode_aio(inode);
 
-       ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+       ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
 
        /*
-        * When writing into uninitialized space, we should not fail to
+        * When writing into unwritten space, we should not fail to
         * allocate metadata blocks for the new extent block if needed.
         */
        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
 
-       trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+       trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
                                                    allocated, newblock);
 
        /* get_block() before submit the IO, split the extent */
-       if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-               ret = ext4_split_unwritten_extents(handle, inode, map,
-                                                  path, flags);
+       if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+               ret = ext4_split_convert_extents(handle, inode, map, ppath,
+                                        flags | EXT4_GET_BLOCKS_CONVERT);
                if (ret <= 0)
                        goto out;
                /*
@@ -3817,14 +4029,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                map->m_flags |= EXT4_MAP_UNWRITTEN;
-               if (ext4_should_dioread_nolock(inode))
-                       map->m_flags |= EXT4_MAP_UNINIT;
                goto out;
        }
        /* IO end_io complete, convert the filled extent to written */
-       if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+       if (flags & EXT4_GET_BLOCKS_CONVERT) {
                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
-                                                       path);
+                                                          ppath);
                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
                        err = check_eofblocks_fl(handle, inode, map->m_lblk,
@@ -3832,6 +4042,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                } else
                        err = ret;
                map->m_flags |= EXT4_MAP_MAPPED;
+               map->m_pblk = newblock;
                if (allocated > map->m_len)
                        allocated = map->m_len;
                map->m_len = allocated;
@@ -3842,7 +4053,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
         * repeat fallocate creation request
         * we already have an unwritten extent
         */
-       if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+       if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                map->m_flags |= EXT4_MAP_UNWRITTEN;
                goto map_out;
        }
@@ -3861,7 +4072,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        }
 
        /* buffered write, writepage time, convert*/
-       ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
+       ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3918,10 +4129,6 @@ out1:
        map->m_pblk = newblock;
        map->m_len = allocated;
 out2:
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
        return err ? err : allocated;
 }
 
@@ -4066,7 +4273,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
        /* find extent for this block */
-       path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+       path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
        if (IS_ERR(path)) {
                err = PTR_ERR(path);
                path = NULL;
@@ -4078,7 +4285,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        /*
         * consistent leaf must not be empty;
         * this situation is possible, though, _during_ tree modification;
-        * this is why assert can't be put in ext4_ext_find_extent()
+        * this is why assert can't be put in ext4_find_extent()
         */
        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode, "bad extent address "
@@ -4095,8 +4302,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
 
+
                /*
-                * Uninitialized extents are treated as holes, except that
+                * unwritten extents are treated as holes, except that
                 * we split out initialized portions during a write.
                 */
                ee_len = ext4_ext_get_actual_len(ex);
@@ -4111,17 +4319,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                  ee_block, ee_len, newblock);
 
-                       if (!ext4_ext_is_uninitialized(ex))
+                       /*
+                        * If the extent is initialized check whether the
+                        * caller wants to convert it to unwritten.
+                        */
+                       if ((!ext4_ext_is_unwritten(ex)) &&
+                           (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+                               allocated = convert_initialized_extent(
+                                               handle, inode, map, &path,
+                                               flags, allocated, newblock);
+                               goto out2;
+                       } else if (!ext4_ext_is_unwritten(ex))
                                goto out;
 
-                       ret = ext4_ext_handle_uninitialized_extents(
-                               handle, inode, map, path, flags,
+                       ret = ext4_ext_handle_unwritten_extents(
+                               handle, inode, map, &path, flags,
                                allocated, newblock);
                        if (ret < 0)
                                err = ret;
                        else
                                allocated = ret;
-                       goto out3;
+                       goto out2;
                }
        }
 
@@ -4152,7 +4370,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
        /*
         * If we are doing bigalloc, check to see if the extent returned
-        * by ext4_ext_find_extent() implies a cluster we can use.
+        * by ext4_find_extent() implies a cluster we can use.
         */
        if (cluster_offset && ex &&
            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
@@ -4186,15 +4404,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        /*
         * See if request is beyond maximum number of blocks we can have in
         * a single extent. For an initialized extent this limit is
-        * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
-        * EXT_UNINIT_MAX_LEN.
+        * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+        * EXT_UNWRITTEN_MAX_LEN.
         */
        if (map->m_len > EXT_INIT_MAX_LEN &&
-           !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+           !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                map->m_len = EXT_INIT_MAX_LEN;
-       else if (map->m_len > EXT_UNINIT_MAX_LEN &&
-                (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-               map->m_len = EXT_UNINIT_MAX_LEN;
+       else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+                (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+               map->m_len = EXT_UNWRITTEN_MAX_LEN;
 
        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
        newex.ee_len = cpu_to_le16(map->m_len);
@@ -4227,6 +4445,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ar.flags = 0;
        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               ar.flags |= EXT4_MB_DELALLOC_RESERVED;
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
@@ -4242,21 +4462,19 @@ got_allocated_blocks:
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock + offset);
        newex.ee_len = cpu_to_le16(ar.len);
-       /* Mark uninitialized */
-       if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
-               ext4_ext_mark_uninitialized(&newex);
+       /* Mark unwritten */
+       if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+               ext4_ext_mark_unwritten(&newex);
                map->m_flags |= EXT4_MAP_UNWRITTEN;
                /*
                 * io_end structure was created for every IO write to an
-                * uninitialized extent. To avoid unnecessary conversion,
+                * unwritten extent. To avoid unnecessary conversion,
                 * here we flag the IO that really needs the conversion.
                 * For non asycn direct IO case, flag the inode state
                 * that we need to perform conversion when IO is done.
                 */
-               if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+               if (flags & EXT4_GET_BLOCKS_PRE_IO)
                        set_unwritten = 1;
-               if (ext4_should_dioread_nolock(inode))
-                       map->m_flags |= EXT4_MAP_UNINIT;
        }
 
        err = 0;
@@ -4264,7 +4482,7 @@ got_allocated_blocks:
                err = check_eofblocks_fl(handle, inode, map->m_lblk,
                                         path, ar.len);
        if (!err)
-               err = ext4_ext_insert_extent(handle, inode, path,
+               err = ext4_ext_insert_extent(handle, inode, &path,
                                             &newex, flags);
 
        if (!err && set_unwritten) {
@@ -4282,8 +4500,8 @@ got_allocated_blocks:
                /* not a good idea to call discard here directly,
                 * but otherwise we'd need to call it every free() */
                ext4_discard_preallocations(inode);
-               ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
-                                ext4_ext_get_actual_len(&newex), fb_flags);
+               ext4_free_blocks(handle, inode, NULL, newblock,
+                                EXT4_C2B(sbi, allocated_clusters), fb_flags);
                goto out2;
        }
 
@@ -4383,9 +4601,9 @@ got_allocated_blocks:
 
        /*
         * Cache the extent and update transaction to commit on fdatasync only
-        * when it is _not_ an uninitialized extent.
+        * when it is _not_ an unwritten extent.
         */
-       if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+       if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
        else
                ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -4397,14 +4615,12 @@ out:
        map->m_pblk = newblock;
        map->m_len = allocated;
 out2:
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
-
-out3:
-       trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+       ext4_ext_drop_refs(path);
+       kfree(path);
 
+       trace_ext4_ext_map_blocks_exit(inode, flags, map,
+                                      err ? err : allocated);
+       ext4_es_lru_add(inode);
        return err ? err : allocated;
 }
 
@@ -4442,34 +4658,240 @@ retry:
        ext4_std_error(inode->i_sb, err);
 }
 
-static void ext4_falloc_update_inode(struct inode *inode,
-                               int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+                                 ext4_lblk_t len, loff_t new_size,
+                                 int flags, int mode)
 {
-       struct timespec now;
+       struct inode *inode = file_inode(file);
+       handle_t *handle;
+       int ret = 0;
+       int ret2 = 0;
+       int retries = 0;
+       struct ext4_map_blocks map;
+       unsigned int credits;
+       loff_t epos;
+
+       map.m_lblk = offset;
+       map.m_len = len;
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (len <= EXT_UNWRITTEN_MAX_LEN)
+               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+       /*
+        * credits to insert 1 extent into extent tree
+        */
+       credits = ext4_chunk_trans_blocks(inode, len);
 
-       if (update_ctime) {
-               now = current_fs_time(inode->i_sb);
-               if (!timespec_equal(&inode->i_ctime, &now))
-                       inode->i_ctime = now;
+retry:
+       while (ret >= 0 && len) {
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+                                           credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       break;
+               }
+               ret = ext4_map_blocks(handle, inode, &map, flags);
+               if (ret <= 0) {
+                       ext4_debug("inode #%lu: block %u: len %u: "
+                                  "ext4_ext_map_blocks returned %d",
+                                  inode->i_ino, map.m_lblk,
+                                  map.m_len, ret);
+                       ext4_mark_inode_dirty(handle, inode);
+                       ret2 = ext4_journal_stop(handle);
+                       break;
+               }
+               map.m_lblk += ret;
+               map.m_len = len = len - ret;
+               epos = (loff_t)map.m_lblk << inode->i_blkbits;
+               inode->i_ctime = ext4_current_time(inode);
+               if (new_size) {
+                       if (epos > new_size)
+                               epos = new_size;
+                       if (ext4_update_inode_size(inode, epos) & 0x1)
+                               inode->i_mtime = inode->i_ctime;
+               } else {
+                       if (epos > inode->i_size)
+                               ext4_set_inode_flag(inode,
+                                                   EXT4_INODE_EOFBLOCKS);
+               }
+               ext4_mark_inode_dirty(handle, inode);
+               ret2 = ext4_journal_stop(handle);
+               if (ret2)
+                       break;
        }
+       if (ret == -ENOSPC &&
+                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
+               ret = 0;
+               goto retry;
+       }
+
+       return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+                           loff_t len, int mode)
+{
+       struct inode *inode = file_inode(file);
+       handle_t *handle = NULL;
+       unsigned int max_blocks;
+       loff_t new_size = 0;
+       int ret = 0;
+       int flags;
+       int credits;
+       int partial_begin, partial_end;
+       loff_t start, end;
+       ext4_lblk_t lblk;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int blkbits = inode->i_blkbits;
+
+       trace_ext4_zero_range(inode, offset, len, mode);
+
+       if (!S_ISREG(inode->i_mode))
+               return -EINVAL;
+
+       /* Call ext4_force_commit to flush all data in case of data=journal. */
+       if (ext4_should_journal_data(inode)) {
+               ret = ext4_force_commit(inode->i_sb);
+               if (ret)
+                       return ret;
+       }
+
        /*
-        * Update only when preallocation was requested beyond
-        * the file size.
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
         */
-       if (!(mode & FALLOC_FL_KEEP_SIZE)) {
-               if (new_size > i_size_read(inode))
-                       i_size_write(inode, new_size);
-               if (new_size > EXT4_I(inode)->i_disksize)
-                       ext4_update_i_disksize(inode, new_size);
-       } else {
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               ret = filemap_write_and_wait_range(mapping, offset,
+                                                  offset + len - 1);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Round up offset. This is not fallocate, we need to zero out
+        * blocks, so convert interior block aligned part of the range to
+        * unwritten and possibly manually zero out unaligned parts of the
+        * range.
+        */
+       start = round_up(offset, 1 << blkbits);
+       end = round_down((offset + len), 1 << blkbits);
+
+       if (start < offset || end > offset + len)
+               return -EINVAL;
+       partial_begin = offset & ((1 << blkbits) - 1);
+       partial_end = (offset + len) & ((1 << blkbits) - 1);
+
+       lblk = start >> blkbits;
+       max_blocks = (end >> blkbits);
+       if (max_blocks < lblk)
+               max_blocks = 0;
+       else
+               max_blocks -= lblk;
+
+       flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+               EXT4_EX_NOCACHE;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * Indirect files do not support unwritten extents
+        */
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out_mutex;
                /*
-                * Mark that we allocate beyond EOF so the subsequent truncate
-                * can proceed even if the new size is the same as i_size.
+                * If we have a partial block after EOF we have to allocate
+                * the entire block.
                 */
-               if (new_size > i_size_read(inode))
+               if (partial_end)
+                       max_blocks += 1;
+       }
+
+       if (max_blocks > 0) {
+
+               /* Now release the pages and zero block aligned part of pages*/
+               truncate_pagecache_range(inode, start, end - 1);
+               inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+               /* Wait all existing dio workers, newcomers will block on i_mutex */
+               ext4_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+
+               ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+                                            flags, mode);
+               if (ret)
+                       goto out_dio;
+               /*
+                * Remove entire range from the extent status tree.
+                *
+                * ext4_es_remove_extent(inode, lblk, max_blocks) is
+                * NOT sufficient.  I'm not sure why this is the case,
+                * but let's be conservative and remove the extent
+                * status tree for the entire inode.  There should be
+                * no outstanding delalloc extents thanks to the
+                * filemap_write_and_wait_range() call above.
+                */
+               ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+               if (ret)
+                       goto out_dio;
+       }
+       if (!partial_begin && !partial_end)
+               goto out_dio;
+
+       /*
+        * In worst case we have to writeout two nonadjacent unwritten
+        * blocks and update the inode
+        */
+       credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
+       if (ext4_should_journal_data(inode))
+               credits += 2;
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               ext4_std_error(inode->i_sb, ret);
+               goto out_dio;
+       }
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       if (new_size) {
+               ext4_update_inode_size(inode, new_size);
+       } else {
+               /*
+               * Mark that we allocate beyond EOF so the subsequent truncate
+               * can proceed even if the new size is the same as i_size.
+               */
+               if ((offset + len) > i_size_read(inode))
                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
+       ext4_mark_inode_dirty(handle, inode);
+
+       /* Zero out partial block at the edges of the range */
+       ret = ext4_zero_partial_blocks(handle, inode, offset, len);
 
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
 }
 
 /*
@@ -4482,111 +4904,75 @@ static void ext4_falloc_update_inode(struct inode *inode,
 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
-       handle_t *handle;
-       loff_t new_size;
+       loff_t new_size = 0;
        unsigned int max_blocks;
        int ret = 0;
-       int ret2 = 0;
-       int retries = 0;
        int flags;
-       struct ext4_map_blocks map;
-       unsigned int credits, blkbits = inode->i_blkbits;
+       ext4_lblk_t lblk;
+       unsigned int blkbits = inode->i_blkbits;
 
        /* Return error if mode is not supported */
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
 
        if (mode & FALLOC_FL_PUNCH_HOLE)
-               return ext4_punch_hole(file, offset, len);
+               return ext4_punch_hole(inode, offset, len);
 
        ret = ext4_convert_inline_data(inode);
        if (ret)
                return ret;
 
-       /*
-        * currently supporting (pre)allocate mode for extent-based
-        * files _only_
-        */
-       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-               return -EOPNOTSUPP;
+       if (mode & FALLOC_FL_COLLAPSE_RANGE)
+               return ext4_collapse_range(inode, offset, len);
+
+       if (mode & FALLOC_FL_ZERO_RANGE)
+               return ext4_zero_range(file, offset, len, mode);
 
        trace_ext4_fallocate_enter(inode, offset, len, mode);
-       map.m_lblk = offset >> blkbits;
+       lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because
         * If blocksize = 4096 offset = 3072 and len = 2048
         */
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-               - map.m_lblk;
-       /*
-        * credits to insert 1 extent into extent tree
-        */
-       credits = ext4_chunk_trans_blocks(inode, max_blocks);
-       mutex_lock(&inode->i_mutex);
-       ret = inode_newsize_ok(inode, (len + offset));
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-               return ret;
-       }
-       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+               - lblk;
+
+       flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+       mutex_lock(&inode->i_mutex);
+
        /*
-        * Don't normalize the request if it can fit in one extent so
-        * that it doesn't get unnecessarily split into multiple
-        * extents.
+        * We only support preallocation for extent-based files
         */
-       if (len <= EXT_UNINIT_MAX_LEN << blkbits)
-               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-
-retry:
-       while (ret >= 0 && ret < max_blocks) {
-               map.m_lblk = map.m_lblk + ret;
-               map.m_len = max_blocks = max_blocks - ret;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       break;
-               }
-               ret = ext4_map_blocks(handle, inode, &map, flags);
-               if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
-                       ext4_warning(inode->i_sb,
-                                    "inode #%lu: block %u: len %u: "
-                                    "ext4_ext_map_blocks returned %d",
-                                    inode->i_ino, map.m_lblk,
-                                    map.m_len, ret);
-#endif
-                       ext4_mark_inode_dirty(handle, inode);
-                       ret2 = ext4_journal_stop(handle);
-                       break;
-               }
-               if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
-                                               blkbits) >> blkbits))
-                       new_size = offset + len;
-               else
-                       new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
 
-               ext4_falloc_update_inode(inode, mode, new_size,
-                                        (map.m_flags & EXT4_MAP_NEW));
-               ext4_mark_inode_dirty(handle, inode);
-               if ((file->f_flags & O_SYNC) && ret >= max_blocks)
-                       ext4_handle_sync(handle);
-               ret2 = ext4_journal_stop(handle);
-               if (ret2)
-                       break;
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out;
        }
-       if (ret == -ENOSPC &&
-                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
-               ret = 0;
-               goto retry;
+
+       ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
+                                    flags, mode);
+       if (ret)
+               goto out;
+
+       if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+               ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+                                               EXT4_I(inode)->i_sync_tid);
        }
+out:
        mutex_unlock(&inode->i_mutex);
-       trace_ext4_fallocate_exit(inode, offset, max_blocks,
-                               ret > 0 ? ret2 : ret);
-       return ret > 0 ? ret2 : ret;
+       trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+       return ret;
 }
 
 /*
@@ -4599,10 +4985,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-                                   ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+                                  loff_t offset, ssize_t len)
 {
-       handle_t *handle;
        unsigned int max_blocks;
        int ret = 0;
        int ret2 = 0;
@@ -4617,16 +5002,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
        max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
                      map.m_lblk);
        /*
-        * credits to insert 1 extent into extent tree
+        * This is somewhat ugly but the idea is clear: When transaction is
+        * reserved, everything goes into it. Otherwise we rather start several
+        * smaller transactions for conversion of each extent separately.
         */
-       credits = ext4_chunk_trans_blocks(inode, max_blocks);
+       if (handle) {
+               handle = ext4_journal_start_reserved(handle,
+                                                    EXT4_HT_EXT_CONVERT);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+               credits = 0;
+       } else {
+               /*
+                * credits to insert 1 extent into extent tree
+                */
+               credits = ext4_chunk_trans_blocks(inode, max_blocks);
+       }
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk += ret;
                map.m_len = (max_blocks -= ret);
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       break;
+               if (credits) {
+                       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+                                                   credits);
+                       if (IS_ERR(handle)) {
+                               ret = PTR_ERR(handle);
+                               break;
+                       }
                }
                ret = ext4_map_blocks(handle, inode, &map,
                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4637,10 +5038,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                                     inode->i_ino, map.m_lblk,
                                     map.m_len, ret);
                ext4_mark_inode_dirty(handle, inode);
-               ret2 = ext4_journal_stop(handle);
-               if (ret <= 0 || ret2 )
+               if (credits)
+                       ret2 = ext4_journal_stop(handle);
+               if (ret <= 0 || ret2)
                        break;
        }
+       if (!credits)
+               ret2 = ext4_journal_stop(handle);
        return ret > 0 ? ret2 : ret;
 }
 
@@ -4743,6 +5147,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        return error;
        }
 
+       if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+               error = ext4_ext_precache(inode);
+               if (error)
+                       return error;
+       }
+
        /* fallback to generic here if not in extents fmt */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return generic_block_fiemap(inode, fieinfo, start, len,
@@ -4770,6 +5180,518 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                error = ext4_fill_fiemap_extents(inode, start_blk,
                                                 len_blks, fieinfo);
        }
-
+       ext4_es_lru_add(inode);
        return error;
 }
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+               struct ext4_ext_path *path)
+{
+       int credits, err;
+
+       if (!ext4_handle_valid(handle))
+               return 0;
+
+       /*
+        * Check if need to extend journal credits
+        * 3 for leaf, sb, and inode plus 2 (bmap and group
+        * descriptor) for each block group; assume two block
+        * groups
+        */
+       if (handle->h_buffer_credits < 7) {
+               credits = ext4_writepage_trans_blocks(inode);
+               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+               /* EAGAIN is success */
+               if (err && err != -EAGAIN)
+                       return err;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path);
+       return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+                           struct inode *inode, handle_t *handle,
+                           ext4_lblk_t *start)
+{
+       int depth, err = 0;
+       struct ext4_extent *ex_start, *ex_last;
+       bool update = 0;
+       depth = path->p_depth;
+
+       while (depth >= 0) {
+               if (depth == path->p_depth) {
+                       ex_start = path[depth].p_ext;
+                       if (!ex_start)
+                               return -EIO;
+
+                       ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+                       if (!ex_last)
+                               return -EIO;
+
+                       err = ext4_access_path(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+                               update = 1;
+
+                       *start = le32_to_cpu(ex_last->ee_block) +
+                               ext4_ext_get_actual_len(ex_last);
+
+                       while (ex_start <= ex_last) {
+                               le32_add_cpu(&ex_start->ee_block, -shift);
+                               /* Try to merge to the left. */
+                               if ((ex_start >
+                                    EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+                                   ext4_ext_try_to_merge_right(inode,
+                                                       path, ex_start - 1))
+                                       ex_last--;
+                               else
+                                       ex_start++;
+                       }
+                       err = ext4_ext_dirty(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (--depth < 0 || !update)
+                               break;
+               }
+
+               /* Update index too */
+               err = ext4_access_path(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               /* we are done if current index is not a starting index */
+               if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+                       break;
+
+               depth--;
+       }
+
+out:
+       return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+                      ext4_lblk_t start, ext4_lblk_t shift)
+{
+       struct ext4_ext_path *path;
+       int ret = 0, depth;
+       struct ext4_extent *extent;
+       ext4_lblk_t stop_block;
+       ext4_lblk_t ex_start, ex_end;
+
+       /* Let path point to the last extent */
+       path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       depth = path->p_depth;
+       extent = path[depth].p_ext;
+       if (!extent)
+               goto out;
+
+       stop_block = le32_to_cpu(extent->ee_block) +
+                       ext4_ext_get_actual_len(extent);
+
+       /* Nothing to shift, if hole is at the end of file */
+       if (start >= stop_block)
+               goto out;
+
+       /*
+        * Don't start shifting extents until we make sure the hole is big
+        * enough to accommodate the shift.
+        */
+       path = ext4_find_extent(inode, start - 1, &path, 0);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+       depth = path->p_depth;
+       extent =  path[depth].p_ext;
+       if (extent) {
+               ex_start = le32_to_cpu(extent->ee_block);
+               ex_end = le32_to_cpu(extent->ee_block) +
+                       ext4_ext_get_actual_len(extent);
+       } else {
+               ex_start = 0;
+               ex_end = 0;
+       }
+
+       if ((start == ex_start && shift > ex_start) ||
+           (shift > start - ex_end))
+               return -EINVAL;
+
+       /* Its safe to start updating extents */
+       while (start < stop_block) {
+               path = ext4_find_extent(inode, start, &path, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               depth = path->p_depth;
+               extent = path[depth].p_ext;
+               if (!extent) {
+                       EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+                                        (unsigned long) start);
+                       return -EIO;
+               }
+               if (start > le32_to_cpu(extent->ee_block)) {
+                       /* Hole, move to the next extent */
+                       if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+                               path[depth].p_ext++;
+                       } else {
+                               start = ext4_ext_next_allocated_block(path);
+                               continue;
+                       }
+               }
+               ret = ext4_ext_shift_path_extents(path, shift, inode,
+                               handle, &start);
+               if (ret)
+                       break;
+       }
+out:
+       ext4_ext_drop_refs(path);
+       kfree(path);
+       return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct super_block *sb = inode->i_sb;
+       ext4_lblk_t punch_start, punch_stop;
+       handle_t *handle;
+       unsigned int credits;
+       loff_t new_size, ioffset;
+       int ret;
+
+       /* Collapse range works only on fs block size aligned offsets. */
+       if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
+           len & (EXT4_CLUSTER_SIZE(sb) - 1))
+               return -EINVAL;
+
+       if (!S_ISREG(inode->i_mode))
+               return -EINVAL;
+
+       trace_ext4_collapse_range(inode, offset, len);
+
+       punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+       punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+       /* Call ext4_force_commit to flush all data in case of data=journal. */
+       if (ext4_should_journal_data(inode)) {
+               ret = ext4_force_commit(inode->i_sb);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Need to round down offset to be aligned with page size boundary
+        * for page size > block size.
+        */
+       ioffset = round_down(offset, PAGE_SIZE);
+
+       /* Write out all dirty pages */
+       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+                                          LLONG_MAX);
+       if (ret)
+               return ret;
+
+       /* Take mutex lock */
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * There is no need to overlap collapse range with EOF, in which case
+        * it is effectively a truncate operation
+        */
+       if (offset + len >= i_size_read(inode)) {
+               ret = -EINVAL;
+               goto out_mutex;
+       }
+
+       /* Currently just for extent based files */
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       truncate_pagecache(inode, ioffset);
+
+       /* Wait for existing dio to complete */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       credits = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_dio;
+       }
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode);
+
+       ret = ext4_es_remove_extent(inode, punch_start,
+                                   EXT_MAX_BLOCKS - punch_start);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+       ext4_discard_preallocations(inode);
+
+       ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+                                    punch_stop - punch_start);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       new_size = i_size_read(inode) - len;
+       i_size_write(inode, new_size);
+       EXT4_I(inode)->i_disksize = new_size;
+
+       up_write(&EXT4_I(inode)->i_data_sem);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:    First inode
+ * @inode2:    Second inode
+ * @lblk1:     Start block for first inode
+ * @lblk2:     Start block for second inode
+ * @count:     Number of blocks to swap
+ * @mark_unwritten: Mark second inode's extents as unwritten after swap
+ * @erp:       Pointer to save error value
+ *
+ * This helper routine does exactly what it promises: "swap extents". All other
+ * stuff such as page-cache locking consistency, bh mapping consistency or
+ * extent's data copying must be performed by caller.
+ * Locking:
+ *             i_mutex is held for both inodes
+ *             i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *             All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+                    struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+                 ext4_lblk_t count, int unwritten, int *erp)
+{
+       struct ext4_ext_path *path1 = NULL;
+       struct ext4_ext_path *path2 = NULL;
+       int replaced_count = 0;
+
+       BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+       BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+       BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+       BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+
+       *erp = ext4_es_remove_extent(inode1, lblk1, count);
+       if (unlikely(*erp))
+               return 0;
+       *erp = ext4_es_remove_extent(inode2, lblk2, count);
+       if (unlikely(*erp))
+               return 0;
+
+       while (count) {
+               struct ext4_extent *ex1, *ex2, tmp_ex;
+               ext4_lblk_t e1_blk, e2_blk;
+               int e1_len, e2_len, len;
+               int split = 0;
+
+               path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+               if (unlikely(IS_ERR(path1))) {
+                       *erp = PTR_ERR(path1);
+                       path1 = NULL;
+               finish:
+                       count = 0;
+                       goto repeat;
+               }
+               path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+               if (unlikely(IS_ERR(path2))) {
+                       *erp = PTR_ERR(path2);
+                       path2 = NULL;
+                       goto finish;
+               }
+               ex1 = path1[path1->p_depth].p_ext;
+               ex2 = path2[path2->p_depth].p_ext;
+               /* Do we have somthing to swap ? */
+               if (unlikely(!ex2 || !ex1))
+                       goto finish;
+
+               e1_blk = le32_to_cpu(ex1->ee_block);
+               e2_blk = le32_to_cpu(ex2->ee_block);
+               e1_len = ext4_ext_get_actual_len(ex1);
+               e2_len = ext4_ext_get_actual_len(ex2);
+
+               /* Hole handling */
+               if (!in_range(lblk1, e1_blk, e1_len) ||
+                   !in_range(lblk2, e2_blk, e2_len)) {
+                       ext4_lblk_t next1, next2;
+
+                       /* if hole after extent, then go to next extent */
+                       next1 = ext4_ext_next_allocated_block(path1);
+                       next2 = ext4_ext_next_allocated_block(path2);
+                       /* If hole before extent, then shift to that extent */
+                       if (e1_blk > lblk1)
+                               next1 = e1_blk;
+                       if (e2_blk > lblk2)
+                               next2 = e1_blk;
+                       /* Do we have something to swap */
+                       if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+                               goto finish;
+                       /* Move to the rightest boundary */
+                       len = next1 - lblk1;
+                       if (len < next2 - lblk2)
+                               len = next2 - lblk2;
+                       if (len > count)
+                               len = count;
+                       lblk1 += len;
+                       lblk2 += len;
+                       count -= len;
+                       goto repeat;
+               }
+
+               /* Prepare left boundary */
+               if (e1_blk < lblk1) {
+                       split = 1;
+                       *erp = ext4_force_split_extent_at(handle, inode1,
+                                               &path1, lblk1, 0);
+                       if (unlikely(*erp))
+                               goto finish;
+               }
+               if (e2_blk < lblk2) {
+                       split = 1;
+                       *erp = ext4_force_split_extent_at(handle, inode2,
+                                               &path2,  lblk2, 0);
+                       if (unlikely(*erp))
+                               goto finish;
+               }
+               /* ext4_split_extent_at() may result in leaf extent split,
+                * path must to be revalidated. */
+               if (split)
+                       goto repeat;
+
+               /* Prepare right boundary */
+               len = count;
+               if (len > e1_blk + e1_len - lblk1)
+                       len = e1_blk + e1_len - lblk1;
+               if (len > e2_blk + e2_len - lblk2)
+                       len = e2_blk + e2_len - lblk2;
+
+               if (len != e1_len) {
+                       split = 1;
+                       *erp = ext4_force_split_extent_at(handle, inode1,
+                                               &path1, lblk1 + len, 0);
+                       if (unlikely(*erp))
+                               goto finish;
+               }
+               if (len != e2_len) {
+                       split = 1;
+                       *erp = ext4_force_split_extent_at(handle, inode2,
+                                               &path2, lblk2 + len, 0);
+                       if (*erp)
+                               goto finish;
+               }
+               /* ext4_split_extent_at() may result in leaf extent split,
+                * path must to be revalidated. */
+               if (split)
+                       goto repeat;
+
+               BUG_ON(e2_len != e1_len);
+               *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+               if (unlikely(*erp))
+                       goto finish;
+               *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+               if (unlikely(*erp))
+                       goto finish;
+
+               /* Both extents are fully inside boundaries. Swap it now */
+               tmp_ex = *ex1;
+               ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+               ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+               ex1->ee_len = cpu_to_le16(e2_len);
+               ex2->ee_len = cpu_to_le16(e1_len);
+               if (unwritten)
+                       ext4_ext_mark_unwritten(ex2);
+               if (ext4_ext_is_unwritten(&tmp_ex))
+                       ext4_ext_mark_unwritten(ex1);
+
+               ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+               ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+               *erp = ext4_ext_dirty(handle, inode2, path2 +
+                                     path2->p_depth);
+               if (unlikely(*erp))
+                       goto finish;
+               *erp = ext4_ext_dirty(handle, inode1, path1 +
+                                     path1->p_depth);
+               /*
+                * Looks scarry ah..? second inode already points to new blocks,
+                * and it was successfully dirtied. But luckily error may happen
+                * only due to journal error, so full transaction will be
+                * aborted anyway.
+                */
+               if (unlikely(*erp))
+                       goto finish;
+               lblk1 += len;
+               lblk2 += len;
+               replaced_count += len;
+               count -= len;
+
+       repeat:
+               ext4_ext_drop_refs(path1);
+               kfree(path1);
+               ext4_ext_drop_refs(path2);
+               kfree(path2);
+               path1 = path2 = NULL;
+       }
+       return replaced_count;
+}
index 1fefeb7..b860603 100644 (file)
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include "ext4.h"
 #include "extents_status.h"
-#include "ext4_extents.h"
 
 #include <trace/events/ext4.h>
 
@@ -147,6 +149,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end);
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
                                       int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei);
 
 int __init ext4_init_es(void)
 {
@@ -182,7 +186,7 @@ static void ext4_es_print_tree(struct inode *inode)
        while (node) {
                struct extent_status *es;
                es = rb_entry(node, struct extent_status, rb_node);
-               printk(KERN_DEBUG " [%u/%u) %llu %llx",
+               printk(KERN_DEBUG " [%u/%u) %llu %x",
                       es->es_lblk, es->es_len,
                       ext4_es_pblock(es), ext4_es_status(es));
                node = rb_next(node);
@@ -260,7 +264,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
        if (tree->cache_es) {
                es1 = tree->cache_es;
                if (in_range(lblk, es1->es_lblk, es1->es_len)) {
-                       es_debug("%u cached by [%u/%u) %llu %llx\n",
+                       es_debug("%u cached by [%u/%u) %llu %x\n",
                                 lblk, es1->es_lblk, es1->es_len,
                                 ext4_es_pblock(es1), ext4_es_status(es1));
                        goto out;
@@ -291,7 +295,6 @@ out:
 
        read_unlock(&EXT4_I(inode)->i_es_lock);
 
-       ext4_es_lru_add(inode);
        trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -312,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
         */
        if (!ext4_es_is_delayed(es)) {
                EXT4_I(inode)->i_es_lru_nr++;
-               percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+               percpu_counter_inc(&EXT4_SB(inode->i_sb)->
+                                       s_es_stats.es_stats_lru_cnt);
        }
 
+       EXT4_I(inode)->i_es_all_nr++;
+       percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
        return es;
 }
 
 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 {
+       EXT4_I(inode)->i_es_all_nr--;
+       percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+
        /* Decrease the lru counter when this es is not delayed */
        if (!ext4_es_is_delayed(es)) {
                BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
                EXT4_I(inode)->i_es_lru_nr--;
-               percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+               percpu_counter_dec(&EXT4_SB(inode->i_sb)->
+                                       s_es_stats.es_stats_lru_cnt);
        }
 
        kmem_cache_free(ext4_es_cachep, es);
@@ -343,8 +354,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
        if (ext4_es_status(es1) != ext4_es_status(es2))
                return 0;
 
-       if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
+       if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+               pr_warn("ES assertion failed when merging extents. "
+                       "The sum of lengths of es1 (%d) and es2 (%d) "
+                       "is bigger than allowed file size (%d)\n",
+                       es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+               WARN_ON(1);
                return 0;
+       }
 
        if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
                return 0;
@@ -407,6 +424,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 }
 
 #ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h"      /* Needed when ES_AGGRESSIVE_TEST is defined */
+
 static void ext4_es_insert_extent_ext_check(struct inode *inode,
                                            struct extent_status *es)
 {
@@ -417,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
        unsigned short ee_len;
        int depth, ee_status, es_status;
 
-       path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+       path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return;
 
@@ -430,7 +449,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                ee_start = ext4_ext_pblock(ex);
                ee_len = ext4_ext_get_actual_len(ex);
 
-               ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+               ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
                es_status = ext4_es_is_unwritten(es) ? 1 : 0;
 
                /*
@@ -439,11 +458,11 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 */
                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
                        if (in_range(es->es_lblk, ee_block, ee_len)) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%d/%d/%llu/%c], but we "
-                                       "want to add an delayed/hole extent "
-                                       "[%d/%d/%llu/%llx]\n",
+                                       "want to add a delayed/hole extent "
+                                       "[%d/%d/%llu/%x]\n",
                                        inode->i_ino, ee_block, ee_len,
                                        ee_start, ee_status ? 'u' : 'w',
                                        es->es_lblk, es->es_len,
@@ -458,7 +477,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 */
                if (es->es_lblk < ee_block ||
                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
@@ -468,7 +487,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                }
 
                if (ee_status ^ es_status) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
@@ -481,19 +500,17 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 * that we don't want to add an written/unwritten extent.
                 */
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %d but we want "
-                               "to add an written/unwritten extent "
-                               "[%d/%d/%llu/%llx]\n", inode->i_ino,
+                               "to add a written/unwritten extent "
+                               "[%d/%d/%llu/%x]\n", inode->i_ino,
                                es->es_lblk, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                }
        }
 out:
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
+       ext4_ext_drop_refs(path);
+       kfree(path);
 }
 
 static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -519,21 +536,21 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                         * We want to add a delayed/hole extent but this
                         * block has been allocated.
                         */
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
-                               "delayed/hole extent [%d/%d/%llu/%llx]\n",
+                               "delayed/hole extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                } else if (ext4_es_is_written(es)) {
                        if (retval != es->es_len) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu retval %d != es_len %d\n",
                                        inode->i_ino, retval, es->es_len);
                                return;
                        }
                        if (map.m_pblk != ext4_es_pblock(es)) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu m_pblk %llu != "
                                        "es_pblk %llu\n",
                                        inode->i_ino, map.m_pblk,
@@ -549,9 +566,9 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                }
        } else if (retval == 0) {
                if (ext4_es_is_written(es)) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
-                               "an written extent [%d/%d/%llu/%llx]\n",
+                               "a written extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
@@ -632,22 +649,20 @@ out:
 }
 
 /*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
  *
  * Return 0 on success, error code on failure.
  */
 int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len, ext4_fsblk_t pblk,
-                         unsigned long long status)
+                         unsigned int status)
 {
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        int err = 0;
 
-       es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+       es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, inode->i_ino);
 
        if (!len)
@@ -665,8 +680,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 
        newes.es_lblk = lblk;
        newes.es_len = len;
-       ext4_es_store_pblock(&newes, pblk);
-       ext4_es_store_status(&newes, status);
+       ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_insert_extent(inode, &newes);
 
        ext4_es_insert_extent_check(inode, &newes);
@@ -675,18 +689,54 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
        err = __es_remove_extent(inode, lblk, end);
        if (err != 0)
                goto error;
+retry:
        err = __es_insert_extent(inode, &newes);
+       if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+                                              EXT4_I(inode)))
+               goto retry;
+       if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+               err = 0;
 
 error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
 
-       ext4_es_lru_add(inode);
        ext4_es_print_tree(inode);
 
        return err;
 }
 
 /*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+                         ext4_lblk_t len, ext4_fsblk_t pblk,
+                         unsigned int status)
+{
+       struct extent_status *es;
+       struct extent_status newes;
+       ext4_lblk_t end = lblk + len - 1;
+
+       newes.es_lblk = lblk;
+       newes.es_len = len;
+       ext4_es_store_pblock_status(&newes, pblk, status);
+       trace_ext4_es_cache_extent(inode, &newes);
+
+       if (!len)
+               return;
+
+       BUG_ON(end < lblk);
+
+       write_lock(&EXT4_I(inode)->i_es_lock);
+
+       es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+       if (!es || es->es_lblk > end)
+               __es_insert_extent(inode, &newes);
+       write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
  * ext4_es_lookup_extent() looks up an extent in extent status tree.
  *
  * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
@@ -697,6 +747,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                          struct extent_status *es)
 {
        struct ext4_es_tree *tree;
+       struct ext4_es_stats *stats;
        struct extent_status *es1 = NULL;
        struct rb_node *node;
        int found = 0;
@@ -733,16 +784,19 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
        }
 
 out:
+       stats = &EXT4_SB(inode->i_sb)->s_es_stats;
        if (found) {
                BUG_ON(!es1);
                es->es_lblk = es1->es_lblk;
                es->es_len = es1->es_len;
                es->es_pblk = es1->es_pblk;
+               stats->es_stats_cache_hits++;
+       } else {
+               stats->es_stats_cache_misses++;
        }
 
        read_unlock(&EXT4_I(inode)->i_es_lock);
 
-       ext4_es_lru_add(inode);
        trace_ext4_es_lookup_extent_exit(inode, es, found);
        return found;
 }
@@ -756,8 +810,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
        struct extent_status orig_es;
        ext4_lblk_t len1, len2;
        ext4_fsblk_t block;
-       int err = 0;
+       int err;
 
+retry:
+       err = 0;
        es = __es_tree_search(&tree->root, lblk);
        if (!es)
                goto out;
@@ -781,17 +837,21 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 
                        newes.es_lblk = end + 1;
                        newes.es_len = len2;
+                       block = 0x7FDEADBEEFULL;
                        if (ext4_es_is_written(&orig_es) ||
-                           ext4_es_is_unwritten(&orig_es)) {
+                           ext4_es_is_unwritten(&orig_es))
                                block = ext4_es_pblock(&orig_es) +
                                        orig_es.es_len - len2;
-                               ext4_es_store_pblock(&newes, block);
-                       }
-                       ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+                       ext4_es_store_pblock_status(&newes, block,
+                                                   ext4_es_status(&orig_es));
                        err = __es_insert_extent(inode, &newes);
                        if (err) {
                                es->es_lblk = orig_es.es_lblk;
                                es->es_len = orig_es.es_len;
+                               if ((err == -ENOMEM) &&
+                                   __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+                                                    EXT4_I(inode)))
+                                       goto retry;
                                goto out;
                        }
                } else {
@@ -869,85 +929,308 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
        return err;
 }
 
-int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+                                    struct list_head *b)
 {
-       ext4_lblk_t  ee_block;
-       ext4_fsblk_t ee_pblock;
-       unsigned int ee_len;
-
-       ee_block  = le32_to_cpu(ex->ee_block);
-       ee_len    = ext4_ext_get_actual_len(ex);
-       ee_pblock = ext4_ext_pblock(ex);
+       struct ext4_inode_info *eia, *eib;
+       eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+       eib = list_entry(b, struct ext4_inode_info, i_es_lru);
 
-       if (ee_len == 0)
+       if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+           !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+               return 1;
+       if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+           ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+               return -1;
+       if (eia->i_touch_when == eib->i_touch_when)
                return 0;
-
-       return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
-                                    EXTENT_STATUS_WRITTEN);
+       if (time_after(eia->i_touch_when, eib->i_touch_when))
+               return 1;
+       else
+               return -1;
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei)
 {
-       struct ext4_sb_info *sbi = container_of(shrink,
-                                       struct ext4_sb_info, s_es_shrinker);
        struct ext4_inode_info *ei;
-       struct list_head *cur, *tmp, scanned;
-       int nr_to_scan = sc->nr_to_scan;
-       int ret, nr_shrunk = 0;
-
-       ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
-       if (!nr_to_scan)
-               return ret;
-
-       INIT_LIST_HEAD(&scanned);
+       struct ext4_es_stats *es_stats;
+       struct list_head *cur, *tmp;
+       LIST_HEAD(skipped);
+       ktime_t start_time;
+       u64 scan_time;
+       int nr_shrunk = 0;
+       int retried = 0, skip_precached = 1, nr_skipped = 0;
 
+       es_stats = &sbi->s_es_stats;
+       start_time = ktime_get();
        spin_lock(&sbi->s_es_lru_lock);
+
+retry:
        list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-               list_move_tail(cur, &scanned);
+               int shrunk;
+
+               /*
+                * If we have already reclaimed all extents from extent
+                * status tree, just stop the loop immediately.
+                */
+               if (percpu_counter_read_positive(
+                               &es_stats->es_stats_lru_cnt) == 0)
+                       break;
 
                ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-               read_lock(&ei->i_es_lock);
-               if (ei->i_es_lru_nr == 0) {
-                       read_unlock(&ei->i_es_lock);
+               /*
+                * Skip the inode that is newer than the last_sorted
+                * time.  Normally we try hard to avoid shrinking
+                * precached inodes, but we will as a last resort.
+                */
+               if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
+                   (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+                                               EXT4_STATE_EXT_PRECACHED))) {
+                       nr_skipped++;
+                       list_move_tail(cur, &skipped);
                        continue;
                }
-               read_unlock(&ei->i_es_lock);
 
-               write_lock(&ei->i_es_lock);
-               ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+               if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+                   !write_trylock(&ei->i_es_lock))
+                       continue;
+
+               shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+               if (ei->i_es_lru_nr == 0)
+                       list_del_init(&ei->i_es_lru);
                write_unlock(&ei->i_es_lock);
 
-               nr_shrunk += ret;
-               nr_to_scan -= ret;
+               nr_shrunk += shrunk;
+               nr_to_scan -= shrunk;
                if (nr_to_scan == 0)
                        break;
        }
-       list_splice_tail(&scanned, &sbi->s_es_lru);
+
+       /* Move the newer inodes into the tail of the LRU list. */
+       list_splice_tail(&skipped, &sbi->s_es_lru);
+       INIT_LIST_HEAD(&skipped);
+
+       /*
+        * If we skipped any inodes, and we weren't able to make any
+        * forward progress, sort the list and try again.
+        */
+       if ((nr_shrunk == 0) && nr_skipped && !retried) {
+               retried++;
+               list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+               es_stats->es_stats_last_sorted = jiffies;
+               ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+                                     i_es_lru);
+               /*
+                * If there are no non-precached inodes left on the
+                * list, start releasing precached extents.
+                */
+               if (ext4_test_inode_state(&ei->vfs_inode,
+                                         EXT4_STATE_EXT_PRECACHED))
+                       skip_precached = 0;
+               goto retry;
+       }
+
        spin_unlock(&sbi->s_es_lru_lock);
 
-       ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-       trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
-       return ret;
+       if (locked_ei && nr_shrunk == 0)
+               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+       scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+       if (likely(es_stats->es_stats_scan_time))
+               es_stats->es_stats_scan_time = (scan_time +
+                               es_stats->es_stats_scan_time*3) / 4;
+       else
+               es_stats->es_stats_scan_time = scan_time;
+       if (scan_time > es_stats->es_stats_max_scan_time)
+               es_stats->es_stats_max_scan_time = scan_time;
+       if (likely(es_stats->es_stats_shrunk))
+               es_stats->es_stats_shrunk = (nr_shrunk +
+                               es_stats->es_stats_shrunk*3) / 4;
+       else
+               es_stats->es_stats_shrunk = nr_shrunk;
+
+       trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
+                            nr_skipped, retried);
+       return nr_shrunk;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+static unsigned long ext4_es_count(struct shrinker *shrink,
+                                  struct shrink_control *sc)
 {
+       unsigned long nr;
        struct ext4_sb_info *sbi;
 
-       sbi = EXT4_SB(sb);
+       sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+       nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
+       return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+                                 struct shrink_control *sc)
+{
+       struct ext4_sb_info *sbi = container_of(shrink,
+                                       struct ext4_sb_info, s_es_shrinker);
+       int nr_to_scan = sc->nr_to_scan;
+       int ret, nr_shrunk;
+
+       ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
+       trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
+
+       if (!nr_to_scan)
+               return ret;
+
+       nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
+       trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
+       return nr_shrunk;
+}
+
+static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+{
+       return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *
+ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       return NULL;
+}
+
+static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
+{
+       struct ext4_sb_info *sbi = seq->private;
+       struct ext4_es_stats *es_stats = &sbi->s_es_stats;
+       struct ext4_inode_info *ei, *max = NULL;
+       unsigned int inode_cnt = 0;
+
+       if (v != SEQ_START_TOKEN)
+               return 0;
+
+       /* here we just find an inode that has the max nr. of objects */
+       spin_lock(&sbi->s_es_lru_lock);
+       list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
+               inode_cnt++;
+               if (max && max->i_es_all_nr < ei->i_es_all_nr)
+                       max = ei;
+               else if (!max)
+                       max = ei;
+       }
+       spin_unlock(&sbi->s_es_lru_lock);
+
+       seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
+                  percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
+                  percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
+       seq_printf(seq, "  %lu/%lu cache hits/misses\n",
+                  es_stats->es_stats_cache_hits,
+                  es_stats->es_stats_cache_misses);
+       if (es_stats->es_stats_last_sorted != 0)
+               seq_printf(seq, "  %u ms last sorted interval\n",
+                          jiffies_to_msecs(jiffies -
+                                           es_stats->es_stats_last_sorted));
+       if (inode_cnt)
+               seq_printf(seq, "  %d inodes on lru list\n", inode_cnt);
+
+       seq_printf(seq, "average:\n  %llu us scan time\n",
+           div_u64(es_stats->es_stats_scan_time, 1000));
+       seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
+       if (inode_cnt)
+               seq_printf(seq,
+                   "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
+                   "  %llu us max scan time\n",
+                   max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
+                   div_u64(es_stats->es_stats_max_scan_time, 1000));
+
+       return 0;
+}
+
+static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
+       .start = ext4_es_seq_shrinker_info_start,
+       .next  = ext4_es_seq_shrinker_info_next,
+       .stop  = ext4_es_seq_shrinker_info_stop,
+       .show  = ext4_es_seq_shrinker_info_show,
+};
+
+static int
+ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
+{
+       int ret;
+
+       ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
+       if (!ret) {
+               struct seq_file *m = file->private_data;
+               m->private = PDE_DATA(inode);
+       }
+
+       return ret;
+}
+
+static int
+ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
+{
+       return seq_release(inode, file);
+}
+
+static const struct file_operations ext4_es_seq_shrinker_info_fops = {
+       .owner          = THIS_MODULE,
+       .open           = ext4_es_seq_shrinker_info_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = ext4_es_seq_shrinker_info_release,
+};
+
+int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+       int err;
+
        INIT_LIST_HEAD(&sbi->s_es_lru);
        spin_lock_init(&sbi->s_es_lru_lock);
-       sbi->s_es_shrinker.shrink = ext4_es_shrink;
+       sbi->s_es_stats.es_stats_last_sorted = 0;
+       sbi->s_es_stats.es_stats_shrunk = 0;
+       sbi->s_es_stats.es_stats_cache_hits = 0;
+       sbi->s_es_stats.es_stats_cache_misses = 0;
+       sbi->s_es_stats.es_stats_scan_time = 0;
+       sbi->s_es_stats.es_stats_max_scan_time = 0;
+       err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
+       if (err)
+               return err;
+       err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
+       if (err)
+               goto err1;
+
+       sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+       sbi->s_es_shrinker.count_objects = ext4_es_count;
        sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
-       register_shrinker(&sbi->s_es_shrinker);
+       err = register_shrinker(&sbi->s_es_shrinker);
+       if (err)
+               goto err2;
+
+       if (sbi->s_proc)
+               proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
+                                &ext4_es_seq_shrinker_info_fops, sbi);
+
+       return 0;
+
+err2:
+       percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+err1:
+       percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+       return err;
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-       unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+       if (sbi->s_proc)
+               remove_proc_entry("es_shrinker_info", sbi->s_proc);
+       percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+       percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
+       unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -955,11 +1238,14 @@ void ext4_es_lru_add(struct inode *inode)
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+       ei->i_touch_when = jiffies;
+
+       if (!list_empty(&ei->i_es_lru))
+               return;
+
        spin_lock(&sbi->s_es_lru_lock);
        if (list_empty(&ei->i_es_lru))
                list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-       else
-               list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
        spin_unlock(&sbi->s_es_lru_lock);
 }
 
@@ -981,11 +1267,17 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct rb_node *node;
        struct extent_status *es;
-       int nr_shrunk = 0;
+       unsigned long nr_shrunk = 0;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
 
        if (ei->i_es_lru_nr == 0)
                return 0;
 
+       if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+           __ratelimit(&_rs))
+               ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
        node = rb_first(&tree->root);
        while (node != NULL) {
                es = rb_entry(node, struct extent_status, rb_node);
index f740eb0..efd5f97 100644 (file)
 /*
  * These flags live in the high bits of extent_status.es_pblk
  */
-#define EXTENT_STATUS_WRITTEN  (1ULL << 63)
-#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
-#define EXTENT_STATUS_DELAYED  (1ULL << 61)
-#define EXTENT_STATUS_HOLE     (1ULL << 60)
+#define ES_SHIFT       60
+
+#define EXTENT_STATUS_WRITTEN  (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED  (1 << 1)
+#define EXTENT_STATUS_HOLE     (1 << 0)
 
 #define EXTENT_STATUS_FLAGS    (EXTENT_STATUS_WRITTEN | \
                                 EXTENT_STATUS_UNWRITTEN | \
                                 EXTENT_STATUS_DELAYED | \
                                 EXTENT_STATUS_HOLE)
 
+#define ES_WRITTEN             (1ULL << 63)
+#define ES_UNWRITTEN           (1ULL << 62)
+#define ES_DELAYED             (1ULL << 61)
+#define ES_HOLE                        (1ULL << 60)
+
+#define ES_MASK                        (ES_WRITTEN | ES_UNWRITTEN | \
+                                ES_DELAYED | ES_HOLE)
+
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -53,13 +64,27 @@ struct ext4_es_tree {
        struct extent_status *cache_es; /* recently accessed extent */
 };
 
+struct ext4_es_stats {
+       unsigned long es_stats_last_sorted;
+       unsigned long es_stats_shrunk;
+       unsigned long es_stats_cache_hits;
+       unsigned long es_stats_cache_misses;
+       u64 es_stats_scan_time;
+       u64 es_stats_max_scan_time;
+       struct percpu_counter es_stats_all_cnt;
+       struct percpu_counter es_stats_lru_cnt;
+};
+
 extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
 
 extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len, ext4_fsblk_t pblk,
-                                unsigned long long status);
+                                unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+                                ext4_lblk_t len, ext4_fsblk_t pblk,
+                                unsigned int status);
 extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len);
 extern void ext4_es_find_delayed_extent_range(struct inode *inode,
@@ -67,36 +92,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
                                        struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 struct extent_status *es);
-extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
 
 static inline int ext4_es_is_written(struct extent_status *es)
 {
-       return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+       return (es->es_pblk & ES_WRITTEN) != 0;
 }
 
 static inline int ext4_es_is_unwritten(struct extent_status *es)
 {
-       return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+       return (es->es_pblk & ES_UNWRITTEN) != 0;
 }
 
 static inline int ext4_es_is_delayed(struct extent_status *es)
 {
-       return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+       return (es->es_pblk & ES_DELAYED) != 0;
 }
 
 static inline int ext4_es_is_hole(struct extent_status *es)
 {
-       return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+       return (es->es_pblk & ES_HOLE) != 0;
 }
 
-static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+static inline unsigned int ext4_es_status(struct extent_status *es)
 {
-       return (es->es_pblk & EXTENT_STATUS_FLAGS);
+       return es->es_pblk >> ES_SHIFT;
 }
 
 static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
 {
-       return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+       return es->es_pblk & ~ES_MASK;
 }
 
 static inline void ext4_es_store_pblock(struct extent_status *es,
@@ -104,23 +128,29 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
 {
        ext4_fsblk_t block;
 
-       block = (pb & ~EXTENT_STATUS_FLAGS) |
-               (es->es_pblk & EXTENT_STATUS_FLAGS);
+       block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
        es->es_pblk = block;
 }
 
 static inline void ext4_es_store_status(struct extent_status *es,
-                                       unsigned long long status)
+                                       unsigned int status)
 {
-       ext4_fsblk_t block;
+       es->es_pblk = (((ext4_fsblk_t)
+                       (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+                      (es->es_pblk & ~ES_MASK));
+}
 
-       block = (status & EXTENT_STATUS_FLAGS) |
-               (es->es_pblk & ~EXTENT_STATUS_FLAGS);
-       es->es_pblk = block;
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+                                              ext4_fsblk_t pb,
+                                              unsigned int status)
+{
+       es->es_pblk = (((ext4_fsblk_t)
+                       (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+                      (pb & ~ES_MASK));
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
index ec9770f..8131be8 100644 (file)
@@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
 
-void ext4_unwritten_wait(struct inode *inode)
+static void ext4_unwritten_wait(struct inode *inode)
 {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
@@ -74,142 +74,132 @@ void ext4_unwritten_wait(struct inode *inode)
  * or one thread will zero the other's data, causing corruption.
  */
 static int
-ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
-                  unsigned long nr_segs, loff_t pos)
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
 {
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;
-       size_t count = iov_length(iov, nr_segs);
-       loff_t final_size = pos + count;
 
        if (pos >= i_size_read(inode))
                return 0;
 
-       if ((pos & blockmask) || (final_size & blockmask))
+       if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;
 
        return 0;
 }
 
 static ssize_t
-ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
-                   unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
+       struct inode *inode = file_inode(iocb->ki_filp);
+       struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
-       int unaligned_aio = 0;
+       int o_direct = file->f_flags & O_DIRECT;
+       int overwrite = 0;
+       size_t length = iov_iter_count(from);
        ssize_t ret;
-       int *overwrite = iocb->private;
-       size_t length = iov_length(iov, nr_segs);
-
-       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-           !is_sync_kiocb(iocb))
-               unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+       loff_t pos = iocb->ki_pos;
 
-       /* Unaligned direct AIO must be serialized; see comment above */
-       if (unaligned_aio) {
-               mutex_lock(ext4_aio_mutex(inode));
+       /*
+        * Unaligned direct AIO must be serialized; see comment above
+        * In the case of O_APPEND, assume that we must always serialize
+        */
+       if (o_direct &&
+           ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+           !is_sync_kiocb(iocb) &&
+           (file->f_flags & O_APPEND ||
+            ext4_unaligned_aio(inode, from, pos))) {
+               aio_mutex = ext4_aio_mutex(inode);
+               mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
        }
 
-       BUG_ON(iocb->ki_pos != pos);
-
        mutex_lock(&inode->i_mutex);
-       blk_start_plug(&plug);
+       if (file->f_flags & O_APPEND)
+               iocb->ki_pos = pos = i_size_read(inode);
 
-       /* check whether we do a DIO overwrite or not */
-       if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
-           !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
-               struct ext4_map_blocks map;
-               unsigned int blkbits = inode->i_blkbits;
-               int err, len;
+       /*
+        * If we have encountered a bitmap-format file, the size limit
+        * is smaller than s_maxbytes, which is for extent-mapped files.
+        */
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-               map.m_lblk = pos >> blkbits;
-               map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
-                       - map.m_lblk;
-               len = map.m_len;
+               if ((pos > sbi->s_bitmap_maxbytes) ||
+                   (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+                       mutex_unlock(&inode->i_mutex);
+                       ret = -EFBIG;
+                       goto errout;
+               }
 
-               err = ext4_map_blocks(NULL, inode, &map, 0);
-               /*
-                * 'err==len' means that all of blocks has been preallocated no
-                * matter they are initialized or not.  For excluding
-                * uninitialized extents, we need to check m_flags.  There are
-                * two conditions that indicate for initialized extents.
-                * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned;
-                * 2) If we do a real lookup, non-flags are returned.
-                * So we should check these two conditions.
-                */
-               if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
-                       *overwrite = 1;
+               if (pos + length > sbi->s_bitmap_maxbytes)
+                       iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
        }
 
-       ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
-       mutex_unlock(&inode->i_mutex);
-
-       if (ret > 0 || ret == -EIOCBQUEUED) {
-               ssize_t err;
-
-               err = generic_write_sync(file, pos, ret);
-               if (err < 0 && ret > 0)
-                       ret = err;
-       }
-       blk_finish_plug(&plug);
+       iocb->private = &overwrite;
+       if (o_direct) {
+               blk_start_plug(&plug);
 
-       if (unaligned_aio)
-               mutex_unlock(ext4_aio_mutex(inode));
 
-       return ret;
-}
+               /* check whether we do a DIO overwrite or not */
+               if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+                   !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+                       struct ext4_map_blocks map;
+                       unsigned int blkbits = inode->i_blkbits;
+                       int err, len;
 
-static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
-               unsigned long nr_segs, loff_t pos)
-{
-       struct inode *inode = file_inode(iocb->ki_filp);
-       ssize_t ret;
-       int overwrite = 0;
+                       map.m_lblk = pos >> blkbits;
+                       map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+                               - map.m_lblk;
+                       len = map.m_len;
 
-       /*
-        * If we have encountered a bitmap-format file, the size limit
-        * is smaller than s_maxbytes, which is for extent-mapped files.
-        */
+                       err = ext4_map_blocks(NULL, inode, &map, 0);
+                       /*
+                        * 'err==len' means that all of blocks has
+                        * been preallocated no matter they are
+                        * initialized or not.  For excluding
+                        * unwritten extents, we need to check
+                        * m_flags.  There are two conditions that
+                        * indicate for initialized extents.  1) If we
+                        * hit extent cache, EXT4_MAP_MAPPED flag is
+                        * returned; 2) If we do a real lookup,
+                        * non-flags are returned.  So we should check
+                        * these two conditions.
+                        */
+                       if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+                               overwrite = 1;
+               }
+       }
 
-       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
-               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-               size_t length = iov_length(iov, nr_segs);
+       ret = __generic_file_write_iter(iocb, from);
+       mutex_unlock(&inode->i_mutex);
 
-               if ((pos > sbi->s_bitmap_maxbytes ||
-                   (pos == sbi->s_bitmap_maxbytes && length > 0)))
-                       return -EFBIG;
+       if (ret > 0) {
+               ssize_t err;
 
-               if (pos + length > sbi->s_bitmap_maxbytes) {
-                       nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
-                                             sbi->s_bitmap_maxbytes - pos);
-               }
+               err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+               if (err < 0)
+                       ret = err;
        }
+       if (o_direct)
+               blk_finish_plug(&plug);
 
-       iocb->private = &overwrite;
-       if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
-               ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
-       else
-               ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
+errout:
+       if (aio_mutex)
+               mutex_unlock(aio_mutex);
        return ret;
 }
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
+       .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
        .remap_pages    = generic_file_remap_pages,
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-       struct address_space *mapping = file->f_mapping;
-
-       if (!mapping->a_ops->readpage)
-               return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &ext4_file_vm_ops;
        return 0;
@@ -219,7 +209,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 {
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       struct ext4_inode_info *ei = EXT4_I(inode);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;
@@ -244,6 +233,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
+                       BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
@@ -259,22 +249,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
-       if