jbd: Fix oops in journal_remove_journal_head()
Jan Kara [Fri, 24 Jun 2011 21:11:59 +0000 (23:11 +0200)]
journal_remove_journal_head() can oops when trying to access journal_head
returned by bh2jh(). This is caused for example by the following race:

TASK1 TASK2
  journal_commit_transaction()
    ...
    processing t_forget list
      __journal_refile_buffer(jh);
      if (!jh->b_transaction) {
        jbd_unlock_bh_state(bh);
journal_try_to_free_buffers()
  journal_grab_journal_head(bh)
  jbd_lock_bh_state(bh)
  __journal_try_to_free_buffer()
  journal_put_journal_head(jh)
        journal_remove_journal_head(bh);

journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not
part of any transaction and thus frees journal_head before TASK1 gets to doing
so. Note that even buffer_head can be released by try_to_free_buffers() after
journal_put_journal_head() which adds even larger opportunity for oops (but I
didn't see this happen in reality).

Fix the problem by making transactions hold their own journal_head reference
(in b_jcount). That way we don't have to remove journal_head explicitely via
journal_remove_journal_head() and instead just remove journal_head when
b_jcount drops to zero. The result of this is that [__]journal_refile_buffer(),
[__]journal_unfile_buffer(), and __journal_remove_checkpoint() can free
journal_head which needs modification of a few callers. Also we have to be
careful because once journal_head is removed, buffer_head might be freed as
well. So we have to get our own buffer_head reference where it matters.

Signed-off-by: Jan Kara <jack@suse.cz>

fs/jbd/checkpoint.c
fs/jbd/commit.c
fs/jbd/journal.c
fs/jbd/transaction.c
include/linux/jbd.h

index dea7503..61655a3 100644 (file)
@@ -96,10 +96,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
        if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
            !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+               /*
+                * Get our reference so that bh cannot be freed before
+                * we unlock it
+                */
+               get_bh(bh);
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                ret = __journal_remove_checkpoint(jh) + 1;
                jbd_unlock_bh_state(bh);
-               journal_remove_journal_head(bh);
                BUFFER_TRACE(bh, "release");
                __brelse(bh);
        } else {
@@ -221,8 +225,8 @@ restart:
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
+               get_bh(bh);
                if (buffer_locked(bh)) {
-                       get_bh(bh);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
                        wait_on_buffer(bh);
@@ -241,7 +245,6 @@ restart:
                 */
                released = __journal_remove_checkpoint(jh);
                jbd_unlock_bh_state(bh);
-               journal_remove_journal_head(bh);
                __brelse(bh);
        }
 
@@ -305,12 +308,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                ret = 1;
                if (unlikely(buffer_write_io_error(bh)))
                        ret = -EIO;
+               get_bh(bh);
                J_ASSERT_JH(jh, !buffer_jbddirty(bh));
                BUFFER_TRACE(bh, "remove from checkpoint");
                __journal_remove_checkpoint(jh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
-               journal_remove_journal_head(bh);
                __brelse(bh);
        } else {
                /*
@@ -526,9 +529,9 @@ int cleanup_journal_tail(journal_t *journal)
 /*
  * journal_clean_one_cp_list
  *
- * Find all the written-back checkpoint buffers in the given list and release them.
+ * Find all the written-back checkpoint buffers in the given list and release
+ * them.
  *
- * Called with the journal locked.
  * Called with j_list_lock held.
  * Returns number of bufers reaped (for debug)
  */
@@ -635,8 +638,8 @@ out:
  * checkpoint lists.
  *
  * The function returns 1 if it frees the transaction, 0 otherwise.
+ * The function can free jh and bh.
  *
- * This function is called with the journal locked.
  * This function is called with j_list_lock held.
  * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
@@ -655,13 +658,14 @@ int __journal_remove_checkpoint(struct journal_head *jh)
        }
        journal = transaction->t_journal;
 
+       JBUFFER_TRACE(jh, "removing from transaction");
        __buffer_unlink(jh);
        jh->b_cp_transaction = NULL;
+       journal_put_journal_head(jh);
 
        if (transaction->t_checkpoint_list != NULL ||
            transaction->t_checkpoint_io_list != NULL)
                goto out;
-       JBUFFER_TRACE(jh, "transaction has no more buffers");
 
        /*
         * There is one special case to worry about: if we have just pulled the
@@ -672,10 +676,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
         * The locking here around t_state is a bit sleazy.
         * See the comment at the end of journal_commit_transaction().
         */
-       if (transaction->t_state != T_FINISHED) {
-               JBUFFER_TRACE(jh, "belongs to running/committing transaction");
+       if (transaction->t_state != T_FINISHED)
                goto out;
-       }
 
        /* OK, that was the last buffer for the transaction: we can now
           safely remove this transaction from the log */
@@ -687,7 +689,6 @@ int __journal_remove_checkpoint(struct journal_head *jh)
        wake_up(&journal->j_wait_logspace);
        ret = 1;
 out:
-       JBUFFER_TRACE(jh, "exit");
        return ret;
 }
 
@@ -706,6 +707,8 @@ void __journal_insert_checkpoint(struct journal_head *jh,
        J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
 
+       /* Get reference for checkpointing transaction */
+       journal_grab_journal_head(jh2bh(jh));
        jh->b_cp_transaction = transaction;
 
        if (!transaction->t_checkpoint_list) {
index eedd201..8799207 100644 (file)
@@ -258,10 +258,6 @@ write_out_data:
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
-                       journal_remove_journal_head(bh);
-                       /* One for our safety reference, other for
-                        * journal_remove_journal_head() */
-                       put_bh(bh);
                        release_data_buffer(bh);
                }
 
@@ -455,14 +451,9 @@ void journal_commit_transaction(journal_t *journal)
                }
                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
                    jh->b_transaction == commit_transaction &&
-                   jh->b_jlist == BJ_Locked) {
+                   jh->b_jlist == BJ_Locked)
                        __journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       journal_remove_journal_head(bh);
-                       put_bh(bh);
-               } else {
-                       jbd_unlock_bh_state(bh);
-               }
+               jbd_unlock_bh_state(bh);
                release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
@@ -807,10 +798,16 @@ restart_loop:
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
+               int try_to_free = 0;
 
                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
+               /*
+                * Get a reference so that bh cannot be freed before we are
+                * done with it.
+                */
+               get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);
@@ -868,28 +865,27 @@ restart_loop:
                        __journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
-                       JBUFFER_TRACE(jh, "refile for checkpoint writeback");
-                       __journal_refile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
-                       /* The buffer on BJ_Forget list and not jbddirty means
+                       /*
+                        * The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
-                        * list. */
-                       JBUFFER_TRACE(jh, "refile or unfile freed buffer");
-                       __journal_refile_buffer(jh);
-                       if (!jh->b_transaction) {
-                               jbd_unlock_bh_state(bh);
-                                /* needs a brelse */
-                               journal_remove_journal_head(bh);
-                               release_buffer_page(bh);
-                       } else
-                               jbd_unlock_bh_state(bh);
+                        * list.
+                        */
+                       if (!jh->b_next_transaction)
+                               try_to_free = 1;
                }
+               JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+               __journal_refile_buffer(jh);
+               jbd_unlock_bh_state(bh);
+               if (try_to_free)
+                       release_buffer_page(bh);
+               else
+                       __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
index ab019ee..9fe061f 100644 (file)
@@ -1803,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh)
  * When a buffer has its BH_JBD bit set it is immune from being released by
  * core kernel code, mainly via ->b_count.
  *
- * A journal_head may be detached from its buffer_head when the journal_head's
- * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
- * Various places in JBD call journal_remove_journal_head() to indicate that the
- * journal_head can be dropped if needed.
+ * A journal_head is detached from its buffer_head when the journal_head's
+ * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
+ * transaction (b_cp_transaction) hold their references to b_jcount.
  *
  * Various places in the kernel want to attach a journal_head to a buffer_head
  * _before_ attaching the journal_head to a transaction.  To protect the
@@ -1819,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh)
  *     (Attach a journal_head if needed.  Increments b_jcount)
  *     struct journal_head *jh = journal_add_journal_head(bh);
  *     ...
- *     jh->b_transaction = xxx;
- *     journal_put_journal_head(jh);
- *
- * Now, the journal_head's b_jcount is zero, but it is safe from being released
- * because it has a non-zero b_transaction.
+ *      (Get another reference for transaction)
+ *      journal_grab_journal_head(bh);
+ *      jh->b_transaction = xxx;
+ *      (Put original reference)
+ *      journal_put_journal_head(jh);
  */
 
 /*
  * Give a buffer_head a journal_head.
  *
- * Doesn't need the journal lock.
  * May sleep.
  */
 struct journal_head *journal_add_journal_head(struct buffer_head *bh)
@@ -1893,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
        struct journal_head *jh = bh2jh(bh);
 
        J_ASSERT_JH(jh, jh->b_jcount >= 0);
-
-       get_bh(bh);
-       if (jh->b_jcount == 0) {
-               if (jh->b_transaction == NULL &&
-                               jh->b_next_transaction == NULL &&
-                               jh->b_cp_transaction == NULL) {
-                       J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
-                       J_ASSERT_BH(bh, buffer_jbd(bh));
-                       J_ASSERT_BH(bh, jh2bh(jh) == bh);
-                       BUFFER_TRACE(bh, "remove journal_head");
-                       if (jh->b_frozen_data) {
-                               printk(KERN_WARNING "%s: freeing "
-                                               "b_frozen_data\n",
-                                               __func__);
-                               jbd_free(jh->b_frozen_data, bh->b_size);
-                       }
-                       if (jh->b_committed_data) {
-                               printk(KERN_WARNING "%s: freeing "
-                                               "b_committed_data\n",
-                                               __func__);
-                               jbd_free(jh->b_committed_data, bh->b_size);
-                       }
-                       bh->b_private = NULL;
-                       jh->b_bh = NULL;        /* debug, really */
-                       clear_buffer_jbd(bh);
-                       __brelse(bh);
-                       journal_free_journal_head(jh);
-               } else {
-                       BUFFER_TRACE(bh, "journal_head was locked");
-               }
+       J_ASSERT_JH(jh, jh->b_transaction == NULL);
+       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+       J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+       J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+       J_ASSERT_BH(bh, buffer_jbd(bh));
+       J_ASSERT_BH(bh, jh2bh(jh) == bh);
+       BUFFER_TRACE(bh, "remove journal_head");
+       if (jh->b_frozen_data) {
+               printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
+               jbd_free(jh->b_frozen_data, bh->b_size);
        }
+       if (jh->b_committed_data) {
+               printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
+               jbd_free(jh->b_committed_data, bh->b_size);
+       }
+       bh->b_private = NULL;
+       jh->b_bh = NULL;        /* debug, really */
+       clear_buffer_jbd(bh);
+       journal_free_journal_head(jh);
 }
 
 /*
- * journal_remove_journal_head(): if the buffer isn't attached to a transaction
- * and has a zero b_jcount then remove and release its journal_head.   If we did
- * see that the buffer is not used by any transaction we also "logically"
- * decrement ->b_count.
- *
- * We in fact take an additional increment on ->b_count as a convenience,
- * because the caller usually wants to do additional things with the bh
- * after calling here.
- * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
- * time.  Once the caller has run __brelse(), the buffer is eligible for
- * reaping by try_to_free_buffers().
- */
-void journal_remove_journal_head(struct buffer_head *bh)
-{
-       jbd_lock_bh_journal_head(bh);
-       __journal_remove_journal_head(bh);
-       jbd_unlock_bh_journal_head(bh);
-}
-
-/*
- * Drop a reference on the passed journal_head.  If it fell to zero then try to
+ * Drop a reference on the passed journal_head.  If it fell to zero then
  * release the journal_head from the buffer_head.
  */
 void journal_put_journal_head(struct journal_head *jh)
@@ -1957,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh)
        jbd_lock_bh_journal_head(bh);
        J_ASSERT_JH(jh, jh->b_jcount > 0);
        --jh->b_jcount;
-       if (!jh->b_jcount && !jh->b_transaction) {
+       if (!jh->b_jcount) {
                __journal_remove_journal_head(bh);
+               jbd_unlock_bh_journal_head(bh);
                __brelse(bh);
-       }
-       jbd_unlock_bh_journal_head(bh);
+       } else
+               jbd_unlock_bh_journal_head(bh);
 }
 
 /*
index dc39efd..7e59c6e 100644 (file)
@@ -696,7 +696,6 @@ repeat:
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
-               jh->b_transaction = transaction;
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
                 * committed and so it's safe to clear the dirty bit.
                 */
                clear_buffer_dirty(jh2bh(jh));
-               jh->b_transaction = transaction;
 
                /* first access by this transaction */
                jh->b_modified = 0;
@@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
                                ret = -EIO;
                                goto no_journal;
                        }
-
-                       if (jh->b_transaction != NULL) {
+                       /* We might have slept so buffer could be refiled now */
+                       if (jh->b_transaction != NULL &&
+                           jh->b_transaction != handle->h_transaction) {
                                JBUFFER_TRACE(jh, "unfile from commit");
                                __journal_temp_unlink_buffer(jh);
                                /* It still points to the committing
@@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
                if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
                        JBUFFER_TRACE(jh, "not on correct data list: unfile");
                        J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-                       __journal_temp_unlink_buffer(jh);
-                       jh->b_transaction = handle->h_transaction;
                        JBUFFER_TRACE(jh, "file as data");
                        __journal_file_buffer(jh, handle->h_transaction,
                                                BJ_SyncData);
@@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
                        __journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __journal_unfile_buffer(jh);
-                       journal_remove_journal_head(bh);
-                       __brelse(bh);
                        if (!buffer_jbd(bh)) {
                                spin_unlock(&journal->j_list_lock);
                                jbd_unlock_bh_state(bh);
@@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
                mark_buffer_dirty(bh);  /* Expose it to the VM */
 }
 
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
 void __journal_unfile_buffer(struct journal_head *jh)
 {
        __journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
+       journal_put_journal_head(jh);
 }
 
 void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
 {
-       jbd_lock_bh_state(jh2bh(jh));
+       struct buffer_head *bh = jh2bh(jh);
+
+       /* Get reference so that buffer cannot be freed before we unlock it */
+       get_bh(bh);
+       jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
        __journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(jh2bh(jh));
+       jbd_unlock_bh_state(bh);
+       __brelse(bh);
 }
 
 /*
@@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
                        /* A written-back ordered data buffer */
                        JBUFFER_TRACE(jh, "release data");
                        __journal_unfile_buffer(jh);
-                       journal_remove_journal_head(bh);
-                       __brelse(bh);
                }
        } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
                        __journal_remove_checkpoint(jh);
-                       journal_remove_journal_head(bh);
-                       __brelse(bh);
                }
        }
        spin_unlock(&journal->j_list_lock);
@@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal,
                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
-                * journal_remove_journal_head() and journal_put_journal_head().
+                * journal_put_journal_head().
                 */
                jh = journal_grab_journal_head(bh);
                if (!jh)
@@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        int may_free = 1;
        struct buffer_head *bh = jh2bh(jh);
 
-       __journal_unfile_buffer(jh);
-
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
+               __journal_temp_unlink_buffer(jh);
                /*
                 * We don't want to write the buffer anymore, clear the
                 * bit so that we don't confuse checks in
@@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
-               journal_remove_journal_head(bh);
-               __brelse(bh);
+               __journal_unfile_buffer(jh);
        }
        return may_free;
 }
@@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh,
 
        if (jh->b_transaction)
                __journal_temp_unlink_buffer(jh);
+       else
+               journal_grab_journal_head(bh);
        jh->b_transaction = transaction;
 
        switch (jlist) {
@@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh,
  * already started to be used by a subsequent transaction, refile the
  * buffer on that transaction's metadata list.
  *
- * Called under journal->j_list_lock
- *
+ * Called under j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may be already free when this function returns
  */
 void __journal_refile_buffer(struct journal_head *jh)
 {
@@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh)
 
        was_dirty = test_clear_buffer_jbddirty(bh);
        __journal_temp_unlink_buffer(jh);
+       /*
+        * We set b_transaction here because b_next_transaction will inherit
+        * our jh reference and thus __journal_file_buffer() must not take a
+        * new one.
+        */
        jh->b_transaction = jh->b_next_transaction;
        jh->b_next_transaction = NULL;
        if (buffer_freed(bh))
@@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh)
 }
 
 /*
- * For the unlocked version of this call, also make sure that any
- * hanging journal_head is cleaned up if necessary.
- *
- * __journal_refile_buffer is usually called as part of a single locked
- * operation on a buffer_head, in which the caller is probably going to
- * be hooking the journal_head onto other lists.  In that case it is up
- * to the caller to remove the journal_head if necessary.  For the
- * unlocked journal_refile_buffer call, the caller isn't going to be
- * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak.
- *
- * *** The journal_head may be freed by this call! ***
+ * __journal_refile_buffer() with necessary locking added. We take our bh
+ * reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
  */
 void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 {
        struct buffer_head *bh = jh2bh(jh);
 
+       /* Get reference so that buffer cannot be freed before we unlock it */
+       get_bh(bh);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
-
        __journal_refile_buffer(jh);
        jbd_unlock_bh_state(bh);
-       journal_remove_journal_head(bh);
-
        spin_unlock(&journal->j_list_lock);
        __brelse(bh);
 }
index e069650..e6a5e34 100644 (file)
@@ -940,7 +940,6 @@ extern int     journal_force_commit(journal_t *);
  */
 struct journal_head *journal_add_journal_head(struct buffer_head *bh);
 struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
-void journal_remove_journal_head(struct buffer_head *bh);
 void journal_put_journal_head(struct journal_head *jh);
 
 /*