ext3: Fix lock inversion in ext3_symlink()
Jan Kara [Thu, 21 Apr 2011 21:47:16 +0000 (23:47 +0200)]
ext3_symlink() cannot call __page_symlink() with transaction open.
__page_symlink() calls ext3_write_begin() which gets page lock which ranks
above transaction start (thus lock ordering is violated) and also
ext3_write_begin() waits for a transaction commit when we run out of space
which never happens if we hold transaction open.

Fix the problem by stopping a transaction before calling __page_symlink()
(we have to be careful and put inode to orphan list so that it gets deleted
in case of crash) and starting another one after __page_symlink() returns
for addition of symlink into a directory.

Signed-off-by: Jan Kara <jack@suse.cz>

fs/ext3/namei.c

index 32f3b86..f6ce3e7 100644 (file)
@@ -2189,6 +2189,7 @@ static int ext3_symlink (struct inode * dir,
        handle_t *handle;
        struct inode * inode;
        int l, err, retries = 0;
+       int credits;
 
        l = strlen(symname)+1;
        if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2197,26 @@ static int ext3_symlink (struct inode * dir,
 
        dquot_initialize(dir);
 
+       if (l > EXT3_N_BLOCKS * 4) {
+               /*
+                * For non-fast symlinks, we just allocate inode and put it on
+                * orphan list in the first transaction => we need bitmap,
+                * group descriptor, sb, inode block, quota blocks.
+                */
+               credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+       } else {
+               /*
+                * Fast symlink. We have to add entry to directory
+                * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+                * allocate new inode (bitmap, group descriptor, inode block,
+                * quota blocks, sb is already counted in previous macros).
+                */
+               credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                         EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+                         EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+       }
 retry:
-       handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-                                       EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                       EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+       handle = ext3_journal_start(dir, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
 
@@ -2211,21 +2228,45 @@ retry:
        if (IS_ERR(inode))
                goto out_stop;
 
-       if (l > sizeof (EXT3_I(inode)->i_data)) {
+       if (l > EXT3_N_BLOCKS * 4) {
                inode->i_op = &ext3_symlink_inode_operations;
                ext3_set_aops(inode);
                /*
-                * page_symlink() calls into ext3_prepare/commit_write.
-                * We have a transaction open.  All is sweetness.  It also sets
-                * i_size in generic_commit_write().
+                * We cannot call page_symlink() with transaction started
+                * because it calls into ext3_write_begin() which acquires page
+                * lock which ranks below transaction start (and it can also
+                * wait for journal commit if we are running out of space). So
+                * we have to stop transaction now and restart it when symlink
+                * contents are written.
+                *
+                * To keep fs consistent in case of crash, we have to put inode
+                * to orphan list in the mean time.
                 */
+               drop_nlink(inode);
+               err = ext3_orphan_add(handle, inode);
+               ext3_journal_stop(handle);
+               if (err)
+                       goto err_drop_inode;
                err = __page_symlink(inode, symname, l, 1);
+               if (err)
+                       goto err_drop_inode;
+               /*
+                * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+                * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+                */
+               handle = ext3_journal_start(dir,
+                               EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+                               EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+               if (IS_ERR(handle)) {
+                       err = PTR_ERR(handle);
+                       goto err_drop_inode;
+               }
+               inc_nlink(inode);
+               err = ext3_orphan_del(handle, inode);
                if (err) {
+                       ext3_journal_stop(handle);
                        drop_nlink(inode);
-                       unlock_new_inode(inode);
-                       ext3_mark_inode_dirty(handle, inode);
-                       iput (inode);
-                       goto out_stop;
+                       goto err_drop_inode;
                }
        } else {
                inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2280,10 @@ out_stop:
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
+err_drop_inode:
+       unlock_new_inode(inode);
+       iput (inode);
+       return err;
 }
 
 static int ext3_link (struct dentry * old_dentry,