Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Linus Torvalds [Mon, 11 Apr 2011 22:45:47 +0000 (15:45 -0700)]
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix data corruption regression by reverting commit 6de9843dab3f
  ext4: Allow indirect-block file to grow the file size to max file size
  ext4: allow an active handle to be started when freezing
  ext4: sync the directory inode in ext4_sync_parent()
  ext4: init timer earlier to avoid a kernel panic in __save_error_info
  jbd2: fix potential memory leak on transaction commit
  ext4: fix a double free in ext4_register_li_request
  ext4: fix credits computing for indirect mapped files
  ext4: remove unnecessary [cm]time update of quota file
  jbd2: move bdget out of critical section

fs/ext4/ext4_jbd2.h
fs/ext4/fsync.c
fs/ext4/inode.c
fs/ext4/super.c
fs/jbd2/commit.c
fs/jbd2/journal.c

index e25e99b..d0f5353 100644 (file)
@@ -86,8 +86,8 @@
 
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
index 4673bc0..e9473cb 100644 (file)
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
  * the parent directory's parent as well, and so on recursively, if
  * they are also freshly created.
  */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+       struct writeback_control wbc;
        struct dentry *dentry = NULL;
+       int ret = 0;
 
        while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
                ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
                if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
                        break;
                inode = dentry->d_parent->d_inode;
-               sync_mapping_buffers(inode->i_mapping);
+               ret = sync_mapping_buffers(inode->i_mapping);
+               if (ret)
+                       break;
+               memset(&wbc, 0, sizeof(wbc));
+               wbc.sync_mode = WB_SYNC_ALL;
+               wbc.nr_to_write = 0;         /* only write out the inode */
+               ret = sync_inode(inode, &wbc);
+               if (ret)
+                       break;
        }
+       return ret;
 }
 
 /*
@@ -176,7 +187,7 @@ int ext4_sync_file(struct file *file, int datasync)
        if (!journal) {
                ret = generic_file_fsync(file, datasync);
                if (!ret && !list_empty(&inode->i_dentry))
-                       ext4_sync_parent(inode);
+                       ret = ext4_sync_parent(inode);
                goto out;
        }
 
index ad8e303..f2fa5e8 100644 (file)
@@ -2502,6 +2502,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                 * for partial write.
                 */
                set_buffer_new(bh);
+               set_buffer_mapped(bh);
        }
        return 0;
 }
@@ -4429,8 +4430,8 @@ void ext4_truncate(struct inode *inode)
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
-       int n;
-       ext4_lblk_t last_block;
+       int n = 0;
+       ext4_lblk_t last_block, max_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
 
        trace_ext4_truncate_enter(inode);
@@ -4455,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
 
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+       max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+                                       >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
        if (inode->i_size & (blocksize - 1))
                if (ext4_block_truncate_page(handle, mapping, inode->i_size))
                        goto out_stop;
 
-       n = ext4_block_to_path(inode, last_block, offsets, NULL);
-       if (n == 0)
-               goto out_stop;  /* error */
+       if (last_block != max_block) {
+               n = ext4_block_to_path(inode, last_block, offsets, NULL);
+               if (n == 0)
+                       goto out_stop;  /* error */
+       }
 
        /*
         * OK.  This truncate is going to happen.  We add the inode to the
@@ -4493,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
         */
        ei->i_disksize = inode->i_size;
 
-       if (n == 1) {           /* direct blocks */
+       if (last_block == max_block) {
+               /*
+                * It is unnecessary to free any data blocks if last_block is
+                * equal to the indirect block limit.
+                */
+               goto out_unlock;
+       } else if (n == 1) {            /* direct blocks */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
                goto do_indirects;
@@ -4553,6 +4564,7 @@ do_indirects:
                ;
        }
 
+out_unlock:
        up_write(&ei->i_data_sem);
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
@@ -5398,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
        /* if nrblocks are contiguous */
        if (chunk) {
                /*
-                * With N contiguous data blocks, it need at most
-                * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
-                * 2 dindirect blocks
-                * 1 tindirect block
+                * With N contiguous data blocks, we need at most
+                * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+                * 2 dindirect blocks, and 1 tindirect block
                 */
-               indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
-               return indirects + 3;
+               return DIV_ROUND_UP(nrblocks,
+                                   EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
        }
        /*
         * if nrblocks are not contiguous, worse case, each block touch
index 056474b..8553dfb 100644 (file)
@@ -242,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
  * journal_end calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
  * appropriate.
+ *
+ * To avoid holding j_barrier in userspace when a user calls freeze(),
+ * ext4 prevents a new handle from being started by checking s_frozen,
+ * which lives in an upper layer.
  */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
        journal_t *journal;
+       handle_t  *handle;
 
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
 
-       vfs_check_frozen(sb, SB_FREEZE_TRANS);
-       /* Special case here: if the journal has aborted behind our
-        * backs (eg. EIO in the commit thread), then we still need to
-        * take the FS itself readonly cleanly. */
        journal = EXT4_SB(sb)->s_journal;
-       if (journal) {
-               if (is_journal_aborted(journal)) {
-                       ext4_abort(sb, "Detected aborted journal");
-                       return ERR_PTR(-EROFS);
-               }
-               return jbd2_journal_start(journal, nblocks);
+       handle = ext4_journal_current_handle();
+
+       /*
+        * If a handle has been started, it should be allowed to
+        * finish, otherwise deadlock could happen between freeze
+        * and others(e.g. truncate) due to the restart of the
+        * journal handle if the filesystem is frozen and active
+        * handles are not stopped.
+        */
+       if (!handle)
+               vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+       if (!journal)
+               return ext4_get_nojournal();
+       /*
+        * Special case here: if the journal has aborted behind our
+        * backs (eg. EIO in the commit thread), then we still need to
+        * take the FS itself readonly cleanly.
+        */
+       if (is_journal_aborted(journal)) {
+               ext4_abort(sb, "Detected aborted journal");
+               return ERR_PTR(-EROFS);
        }
-       return ext4_get_nojournal();
+       return jbd2_journal_start(journal, nblocks);
 }
 
 /*
@@ -2975,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
        mutex_unlock(&ext4_li_info->li_list_mtx);
 
        sbi->s_li_request = elr;
+       /*
+        * set elr to NULL here since it has been inserted to
+        * the request_list and the removal and free of it is
+        * handled by ext4_clear_request_list from now on.
+        */
+       elr = NULL;
 
        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
                ret = ext4_run_lazyinit_thread();
@@ -3385,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
 
+       init_timer(&sbi->s_err_report);
+       sbi->s_err_report.function = print_daily_error_info;
+       sbi->s_err_report.data = (unsigned long) sb;
+
        err = percpu_counter_init(&sbi->s_freeblocks_counter,
                        ext4_count_free_blocks(sb));
        if (!err) {
@@ -3646,9 +3673,6 @@ no_journal:
                 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
                 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
 
-       init_timer(&sbi->s_err_report);
-       sbi->s_err_report.function = print_daily_error_info;
-       sbi->s_err_report.data = (unsigned long) sb;
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
@@ -3672,6 +3696,7 @@ failed_mount_wq:
                sbi->s_journal = NULL;
        }
 failed_mount3:
+       del_timer(&sbi->s_err_report);
        if (sbi->s_flex_groups) {
                if (is_vmalloc_addr(sbi->s_flex_groups))
                        vfree(sbi->s_flex_groups);
@@ -4138,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 /*
  * LVM calls this function before a (read-only) snapshot is created.  This
  * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that this function alone cannot bring a filesystem into a clean
+ * state, because ext4 prevents a new handle from being started by
+ * checking @sb->s_frozen, which lives in an upper layer.  It thus needs
+ * help from the upper layer.
  */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4614,11 +4644,24 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
+       struct inode *inode = sb_dqopt(sb)->files[type];
+       handle_t *handle;
+
        /* Force all delayed allocation blocks to be allocated.
         * Caller already holds s_umount sem */
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);
 
+       /* Update modification times of quota files when userspace can
+        * start looking at them */
+       handle = ext4_journal_start(inode, 1);
+       if (IS_ERR(handle))
+               goto out;
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
+
+out:
        return dquot_quota_off(sb, type);
 }
 
@@ -4714,9 +4757,8 @@ out:
        if (inode->i_size < off + len) {
                i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
+               ext4_mark_inode_dirty(handle, inode);
        }
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       ext4_mark_inode_dirty(handle, inode);
        mutex_unlock(&inode->i_mutex);
        return len;
 }
index 20af62f..6e28000 100644 (file)
@@ -105,6 +105,8 @@ static int journal_submit_commit_record(journal_t *journal,
        int ret;
        struct timespec now = current_kernel_time();
 
+       *cbh = NULL;
+
        if (is_journal_aborted(journal))
                return 0;
 
@@ -806,7 +808,7 @@ wait_for_iobuf:
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
-       if (!err && !is_journal_aborted(journal))
+       if (cbh)
                err = journal_wait_on_commit_record(journal, cbh);
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
index aba8eba..e0ec3db 100644 (file)
@@ -2413,10 +2413,12 @@ const char *jbd2_dev_to_name(dev_t device)
        new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
        if (!new_dev)
                return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+       bd = bdget(device);
        spin_lock(&devname_cache_lock);
        if (devcache[i]) {
                if (devcache[i]->device == device) {
                        kfree(new_dev);
+                       bdput(bd);
                        ret = devcache[i]->devname;
                        spin_unlock(&devname_cache_lock);
                        return ret;
@@ -2425,7 +2427,6 @@ const char *jbd2_dev_to_name(dev_t device)
        }
        devcache[i] = new_dev;
        devcache[i]->device = device;
-       bd = bdget(device);
        if (bd) {
                bdevname(bd, devcache[i]->devname);
                bdput(bd);