ext4: Add the journal checksum feature
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25
/*
 * IO completion callback used for the temporary BJ_IO buffer_heads.
 *
 * Mirrors the outcome of the IO into the buffer's uptodate bit (set when
 * @uptodate is nonzero, cleared otherwise) and then unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        unlock_buffer(bh);
}
38
39 /*
40  * When an ext3-ordered file is truncated, it is possible that many pages are
41  * not sucessfully freed, because they are attached to a committing transaction.
42  * After the transaction commits, these pages are left on the LRU, with no
43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
45  * the numbers in /proc/meminfo look odd.
46  *
47  * So here, we have a buffer which has just come off the forget list.  Look to
48  * see if we can strip all buffers from the backing page.
49  *
50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
51  * caller provided us with a ref against the buffer, and we drop that here.
52  */
53 static void release_buffer_page(struct buffer_head *bh)
54 {
55         struct page *page;
56
57         if (buffer_dirty(bh))
58                 goto nope;
59         if (atomic_read(&bh->b_count) != 1)
60                 goto nope;
61         page = bh->b_page;
62         if (!page)
63                 goto nope;
64         if (page->mapping)
65                 goto nope;
66
67         /* OK, it's a truncated page */
68         if (TestSetPageLocked(page))
69                 goto nope;
70
71         page_cache_get(page);
72         __brelse(bh);
73         try_to_free_buffers(page);
74         unlock_page(page);
75         page_cache_release(page);
76         return;
77
78 nope:
79         __brelse(bh);
80 }
81
82 /*
83  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
85  * return 0.  j_list_lock is dropped in this case.
86  */
87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88 {
89         if (!jbd_trylock_bh_state(bh)) {
90                 spin_unlock(&journal->j_list_lock);
91                 schedule();
92                 return 0;
93         }
94         return 1;
95 }
96
97 /*
98  * Done it all: now submit the commit record.  We should have
99  * cleaned up our previous buffers by now, so if we are in abort
100  * mode we can now just skip the rest of the journal write
101  * entirely.
102  *
103  * Returns 1 if the journal needs to be aborted or 0 on success
104  */
105 static int journal_submit_commit_record(journal_t *journal,
106                                         transaction_t *commit_transaction,
107                                         struct buffer_head **cbh,
108                                         __u32 crc32_sum)
109 {
110         struct journal_head *descriptor;
111         struct commit_header *tmp;
112         struct buffer_head *bh;
113         int ret;
114         int barrier_done = 0;
115
116         if (is_journal_aborted(journal))
117                 return 0;
118
119         descriptor = jbd2_journal_get_descriptor_buffer(journal);
120         if (!descriptor)
121                 return 1;
122
123         bh = jh2bh(descriptor);
124
125         tmp = (struct commit_header *)bh->b_data;
126         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
129
130         if (JBD2_HAS_COMPAT_FEATURE(journal,
131                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
132                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
133                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
134                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
135         }
136
137         JBUFFER_TRACE(descriptor, "submit commit block");
138         lock_buffer(bh);
139
140         set_buffer_dirty(bh);
141         set_buffer_uptodate(bh);
142         bh->b_end_io = journal_end_buffer_io_sync;
143
144         if (journal->j_flags & JBD2_BARRIER &&
145                 !JBD2_HAS_COMPAT_FEATURE(journal,
146                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
147                 set_buffer_ordered(bh);
148                 barrier_done = 1;
149         }
150         ret = submit_bh(WRITE, bh);
151
152         /* is it possible for another commit to fail at roughly
153          * the same time as this one?  If so, we don't want to
154          * trust the barrier flag in the super, but instead want
155          * to remember if we sent a barrier request
156          */
157         if (ret == -EOPNOTSUPP && barrier_done) {
158                 char b[BDEVNAME_SIZE];
159
160                 printk(KERN_WARNING
161                         "JBD: barrier-based sync failed on %s - "
162                         "disabling barriers\n",
163                         bdevname(journal->j_dev, b));
164                 spin_lock(&journal->j_state_lock);
165                 journal->j_flags &= ~JBD2_BARRIER;
166                 spin_unlock(&journal->j_state_lock);
167
168                 /* And try again, without the barrier */
169                 clear_buffer_ordered(bh);
170                 set_buffer_uptodate(bh);
171                 set_buffer_dirty(bh);
172                 ret = submit_bh(WRITE, bh);
173         }
174         *cbh = bh;
175         return ret;
176 }
177
178 /*
179  * This function along with journal_submit_commit_record
180  * allows to write the commit record asynchronously.
181  */
182 static int journal_wait_on_commit_record(struct buffer_head *bh)
183 {
184         int ret = 0;
185
186         clear_buffer_dirty(bh);
187         wait_on_buffer(bh);
188
189         if (unlikely(!buffer_uptodate(bh)))
190                 ret = -EIO;
191         put_bh(bh);            /* One for getblk() */
192         jbd2_journal_put_journal_head(bh2jh(bh));
193
194         return ret;
195 }
196
197 /*
198  * Wait for all submitted IO to complete.
199  */
200 static int journal_wait_on_locked_list(journal_t *journal,
201                                        transaction_t *commit_transaction)
202 {
203         int ret = 0;
204         struct journal_head *jh;
205
206         while (commit_transaction->t_locked_list) {
207                 struct buffer_head *bh;
208
209                 jh = commit_transaction->t_locked_list->b_tprev;
210                 bh = jh2bh(jh);
211                 get_bh(bh);
212                 if (buffer_locked(bh)) {
213                         spin_unlock(&journal->j_list_lock);
214                         wait_on_buffer(bh);
215                         if (unlikely(!buffer_uptodate(bh)))
216                                 ret = -EIO;
217                         spin_lock(&journal->j_list_lock);
218                 }
219                 if (!inverted_lock(journal, bh)) {
220                         put_bh(bh);
221                         spin_lock(&journal->j_list_lock);
222                         continue;
223                 }
224                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
225                         __jbd2_journal_unfile_buffer(jh);
226                         jbd_unlock_bh_state(bh);
227                         jbd2_journal_remove_journal_head(bh);
228                         put_bh(bh);
229                 } else {
230                         jbd_unlock_bh_state(bh);
231                 }
232                 put_bh(bh);
233                 cond_resched_lock(&journal->j_list_lock);
234         }
235         return ret;
236   }
237
238 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
239 {
240         int i;
241
242         for (i = 0; i < bufs; i++) {
243                 wbuf[i]->b_end_io = end_buffer_write_sync;
244                 /* We use-up our safety reference in submit_bh() */
245                 submit_bh(WRITE, wbuf[i]);
246         }
247 }
248
/*
 *  Submit all the data buffers to disk
 *
 *  Walks commit_transaction->t_sync_datalist until it is empty.  Dirty
 *  buffers are collected in journal->j_wbuf and written out in batches via
 *  journal_do_submit_data(); buffers that are queued for write or are
 *  locked by someone else are refiled onto BJ_Locked; everything else is
 *  unfiled from the transaction.
 *
 *  Takes and releases journal->j_list_lock itself; the lock is not held
 *  on return.
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;     /* nonzero while we hold the buffer lock on bh */
        int bufs = 0;   /* number of buffers currently batched in wbuf */
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                /*
                 * inverted_lock() drops j_list_lock when its trylock fails,
                 * so on that path take the locks in the blocking order:
                 * bh state first, then j_list_lock.
                 */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                /*
                 * Locks may have been dropped above, so re-check that the
                 * buffer is still a journalled BJ_SyncData buffer of this
                 * transaction before acting on it.
                 */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        /*
                         * We hold the buffer lock and the buffer was still
                         * dirty: batch it for writeout and move it to
                         * BJ_Locked.  The reference taken above is not
                         * dropped here; it stays with the batched buffer.
                         */
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        /* Batch array is full: flush it and rescan. */
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        /*
                         * Not dirty when we looked, but locked by someone
                         * else (presumably IO already in flight): refile
                         * onto BJ_Locked and drop our reference.
                         */
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        jbd2_journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * jbd2_journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                /*
                 * Drop j_list_lock and restart from the label (which calls
                 * cond_resched()) when a reschedule is needed.
                 */
                if (lock_need_resched(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        /* Flush whatever is left in the final, partially filled batch. */
        journal_do_submit_data(wbuf, bufs);
}
352
353 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
354 {
355         struct page *page = bh->b_page;
356         char *addr;
357         __u32 checksum;
358
359         addr = kmap_atomic(page, KM_USER0);
360         checksum = crc32_be(crc32_sum,
361                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
362         kunmap_atomic(addr, KM_USER0);
363
364         return checksum;
365 }
366
367 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
368                                    unsigned long long block)
369 {
370         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
371         if (tag_bytes > JBD2_TAG_SIZE32)
372                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
373 }
374
375 /*
376  * jbd2_journal_commit_transaction
377  *
378  * The primary function for committing a transaction to the log.  This
379  * function is called by the journal thread to begin a complete commit.
380  */
381 void jbd2_journal_commit_transaction(journal_t *journal)
382 {
383         struct transaction_stats_s stats;
384         transaction_t *commit_transaction;
385         struct journal_head *jh, *new_jh, *descriptor;
386         struct buffer_head **wbuf = journal->j_wbuf;
387         int bufs;
388         int flags;
389         int err;
390         unsigned long long blocknr;
391         char *tagp = NULL;
392         journal_header_t *header;
393         journal_block_tag_t *tag = NULL;
394         int space_left = 0;
395         int first_tag = 0;
396         int tag_flag;
397         int i;
398         int tag_bytes = journal_tag_bytes(journal);
399         struct buffer_head *cbh = NULL; /* For transactional checksums */
400         __u32 crc32_sum = ~0;
401
402         /*
403          * First job: lock down the current transaction and wait for
404          * all outstanding updates to complete.
405          */
406
407 #ifdef COMMIT_STATS
408         spin_lock(&journal->j_list_lock);
409         summarise_journal_usage(journal);
410         spin_unlock(&journal->j_list_lock);
411 #endif
412
413         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
414         if (journal->j_flags & JBD2_FLUSHED) {
415                 jbd_debug(3, "super block updated\n");
416                 jbd2_journal_update_superblock(journal, 1);
417         } else {
418                 jbd_debug(3, "superblock not updated\n");
419         }
420
421         J_ASSERT(journal->j_running_transaction != NULL);
422         J_ASSERT(journal->j_committing_transaction == NULL);
423
424         commit_transaction = journal->j_running_transaction;
425         J_ASSERT(commit_transaction->t_state == T_RUNNING);
426
427         jbd_debug(1, "JBD: starting commit of transaction %d\n",
428                         commit_transaction->t_tid);
429
430         spin_lock(&journal->j_state_lock);
431         commit_transaction->t_state = T_LOCKED;
432
433         stats.u.run.rs_wait = commit_transaction->t_max_wait;
434         stats.u.run.rs_locked = jiffies;
435         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
436                                                 stats.u.run.rs_locked);
437
438         spin_lock(&commit_transaction->t_handle_lock);
439         while (commit_transaction->t_updates) {
440                 DEFINE_WAIT(wait);
441
442                 prepare_to_wait(&journal->j_wait_updates, &wait,
443                                         TASK_UNINTERRUPTIBLE);
444                 if (commit_transaction->t_updates) {
445                         spin_unlock(&commit_transaction->t_handle_lock);
446                         spin_unlock(&journal->j_state_lock);
447                         schedule();
448                         spin_lock(&journal->j_state_lock);
449                         spin_lock(&commit_transaction->t_handle_lock);
450                 }
451                 finish_wait(&journal->j_wait_updates, &wait);
452         }
453         spin_unlock(&commit_transaction->t_handle_lock);
454
455         J_ASSERT (commit_transaction->t_outstanding_credits <=
456                         journal->j_max_transaction_buffers);
457
458         /*
459          * First thing we are allowed to do is to discard any remaining
460          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
461          * that there are no such buffers: if a large filesystem
462          * operation like a truncate needs to split itself over multiple
463          * transactions, then it may try to do a jbd2_journal_restart() while
464          * there are still BJ_Reserved buffers outstanding.  These must
465          * be released cleanly from the current transaction.
466          *
467          * In this case, the filesystem must still reserve write access
468          * again before modifying the buffer in the new transaction, but
469          * we do not require it to remember exactly which old buffers it
470          * has reserved.  This is consistent with the existing behaviour
471          * that multiple jbd2_journal_get_write_access() calls to the same
472          * buffer are perfectly permissible.
473          */
474         while (commit_transaction->t_reserved_list) {
475                 jh = commit_transaction->t_reserved_list;
476                 JBUFFER_TRACE(jh, "reserved, unused: refile");
477                 /*
478                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
479                  * leave undo-committed data.
480                  */
481                 if (jh->b_committed_data) {
482                         struct buffer_head *bh = jh2bh(jh);
483
484                         jbd_lock_bh_state(bh);
485                         jbd2_free(jh->b_committed_data, bh->b_size);
486                         jh->b_committed_data = NULL;
487                         jbd_unlock_bh_state(bh);
488                 }
489                 jbd2_journal_refile_buffer(journal, jh);
490         }
491
492         /*
493          * Now try to drop any written-back buffers from the journal's
494          * checkpoint lists.  We do this *before* commit because it potentially
495          * frees some memory
496          */
497         spin_lock(&journal->j_list_lock);
498         __jbd2_journal_clean_checkpoint_list(journal);
499         spin_unlock(&journal->j_list_lock);
500
501         jbd_debug (3, "JBD: commit phase 1\n");
502
503         /*
504          * Switch to a new revoke table.
505          */
506         jbd2_journal_switch_revoke_table(journal);
507
508         stats.u.run.rs_flushing = jiffies;
509         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
510                                                stats.u.run.rs_flushing);
511
512         commit_transaction->t_state = T_FLUSH;
513         journal->j_committing_transaction = commit_transaction;
514         journal->j_running_transaction = NULL;
515         commit_transaction->t_log_start = journal->j_head;
516         wake_up(&journal->j_wait_transaction_locked);
517         spin_unlock(&journal->j_state_lock);
518
519         jbd_debug (3, "JBD: commit phase 2\n");
520
521         /*
522          * First, drop modified flag: all accesses to the buffers
523          * will be tracked for a new transaction only -bzzz
524          */
525         spin_lock(&journal->j_list_lock);
526         if (commit_transaction->t_buffers) {
527                 new_jh = jh = commit_transaction->t_buffers->b_tnext;
528                 do {
529                         J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
530                                         new_jh->b_modified == 0);
531                         new_jh->b_modified = 0;
532                         new_jh = new_jh->b_tnext;
533                 } while (new_jh != jh);
534         }
535         spin_unlock(&journal->j_list_lock);
536
537         /*
538          * Now start flushing things to disk, in the order they appear
539          * on the transaction lists.  Data blocks go first.
540          */
541         err = 0;
542         journal_submit_data_buffers(journal, commit_transaction);
543
544         /*
545          * Wait for all previously submitted IO to complete if commit
546          * record is to be written synchronously.
547          */
548         spin_lock(&journal->j_list_lock);
549         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
550                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
551                 err = journal_wait_on_locked_list(journal,
552                                                 commit_transaction);
553
554         spin_unlock(&journal->j_list_lock);
555
556         if (err)
557                 jbd2_journal_abort(journal, err);
558
559         jbd2_journal_write_revoke_records(journal, commit_transaction);
560
561         jbd_debug(3, "JBD: commit phase 2\n");
562
563         /*
564          * If we found any dirty or locked buffers, then we should have
565          * looped back up to the write_out_data label.  If there weren't
566          * any then journal_clean_data_list should have wiped the list
567          * clean by now, so check that it is in fact empty.
568          */
569         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
570
571         jbd_debug (3, "JBD: commit phase 3\n");
572
573         /*
574          * Way to go: we have now written out all of the data for a
575          * transaction!  Now comes the tricky part: we need to write out
576          * metadata.  Loop over the transaction's entire buffer list:
577          */
578         commit_transaction->t_state = T_COMMIT;
579
580         stats.u.run.rs_logging = jiffies;
581         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
582                                                  stats.u.run.rs_logging);
583         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
584         stats.u.run.rs_blocks_logged = 0;
585
586         descriptor = NULL;
587         bufs = 0;
588         while (commit_transaction->t_buffers) {
589
590                 /* Find the next buffer to be journaled... */
591
592                 jh = commit_transaction->t_buffers;
593
594                 /* If we're in abort mode, we just un-journal the buffer and
595                    release it for background writing. */
596
597                 if (is_journal_aborted(journal)) {
598                         JBUFFER_TRACE(jh, "journal is aborting: refile");
599                         jbd2_journal_refile_buffer(journal, jh);
600                         /* If that was the last one, we need to clean up
601                          * any descriptor buffers which may have been
602                          * already allocated, even if we are now
603                          * aborting. */
604                         if (!commit_transaction->t_buffers)
605                                 goto start_journal_io;
606                         continue;
607                 }
608
609                 /* Make sure we have a descriptor block in which to
610                    record the metadata buffer. */
611
612                 if (!descriptor) {
613                         struct buffer_head *bh;
614
615                         J_ASSERT (bufs == 0);
616
617                         jbd_debug(4, "JBD: get descriptor\n");
618
619                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
620                         if (!descriptor) {
621                                 jbd2_journal_abort(journal, -EIO);
622                                 continue;
623                         }
624
625                         bh = jh2bh(descriptor);
626                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
627                                 (unsigned long long)bh->b_blocknr, bh->b_data);
628                         header = (journal_header_t *)&bh->b_data[0];
629                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
630                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
632
633                         tagp = &bh->b_data[sizeof(journal_header_t)];
634                         space_left = bh->b_size - sizeof(journal_header_t);
635                         first_tag = 1;
636                         set_buffer_jwrite(bh);
637                         set_buffer_dirty(bh);
638                         wbuf[bufs++] = bh;
639
640                         /* Record it so that we can wait for IO
641                            completion later */
642                         BUFFER_TRACE(bh, "ph3: file as descriptor");
643                         jbd2_journal_file_buffer(descriptor, commit_transaction,
644                                         BJ_LogCtl);
645                 }
646
647                 /* Where is the buffer to be written? */
648
649                 err = jbd2_journal_next_log_block(journal, &blocknr);
650                 /* If the block mapping failed, just abandon the buffer
651                    and repeat this loop: we'll fall into the
652                    refile-on-abort condition above. */
653                 if (err) {
654                         jbd2_journal_abort(journal, err);
655                         continue;
656                 }
657
658                 /*
659                  * start_this_handle() uses t_outstanding_credits to determine
660                  * the free space in the log, but this counter is changed
661                  * by jbd2_journal_next_log_block() also.
662                  */
663                 commit_transaction->t_outstanding_credits--;
664
665                 /* Bump b_count to prevent truncate from stumbling over
666                    the shadowed buffer!  @@@ This can go if we ever get
667                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
668                 atomic_inc(&jh2bh(jh)->b_count);
669
670                 /* Make a temporary IO buffer with which to write it out
671                    (this will requeue both the metadata buffer and the
672                    temporary IO buffer). new_bh goes on BJ_IO*/
673
674                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675                 /*
676                  * akpm: jbd2_journal_write_metadata_buffer() sets
677                  * new_bh->b_transaction to commit_transaction.
678                  * We need to clean this up before we release new_bh
679                  * (which is of type BJ_IO)
680                  */
681                 JBUFFER_TRACE(jh, "ph3: write metadata");
682                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
683                                                       jh, &new_jh, blocknr);
684                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
685                 wbuf[bufs++] = jh2bh(new_jh);
686
687                 /* Record the new block's tag in the current descriptor
688                    buffer */
689
690                 tag_flag = 0;
691                 if (flags & 1)
692                         tag_flag |= JBD2_FLAG_ESCAPE;
693                 if (!first_tag)
694                         tag_flag |= JBD2_FLAG_SAME_UUID;
695
696                 tag = (journal_block_tag_t *) tagp;
697                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
698                 tag->t_flags = cpu_to_be32(tag_flag);
699                 tagp += tag_bytes;
700                 space_left -= tag_bytes;
701
702                 if (first_tag) {
703                         memcpy (tagp, journal->j_uuid, 16);
704                         tagp += 16;
705                         space_left -= 16;
706                         first_tag = 0;
707                 }
708
709                 /* If there's no more to do, or if the descriptor is full,
710                    let the IO rip! */
711
712                 if (bufs == journal->j_wbufsize ||
713                     commit_transaction->t_buffers == NULL ||
714                     space_left < tag_bytes + 16) {
715
716                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
717
718                         /* Write an end-of-descriptor marker before
719                            submitting the IOs.  "tag" still points to
720                            the last tag we set up. */
721
722                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
723
724 start_journal_io:
725                         for (i = 0; i < bufs; i++) {
726                                 struct buffer_head *bh = wbuf[i];
727                                 /*
728                                  * Compute checksum.
729                                  */
730                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
731                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
732                                         crc32_sum =
733                                             jbd2_checksum_data(crc32_sum, bh);
734                                 }
735
736                                 lock_buffer(bh);
737                                 clear_buffer_dirty(bh);
738                                 set_buffer_uptodate(bh);
739                                 bh->b_end_io = journal_end_buffer_io_sync;
740                                 submit_bh(WRITE, bh);
741                         }
742                         cond_resched();
743                         stats.u.run.rs_blocks_logged += bufs;
744
745                         /* Force a new descriptor to be generated next
746                            time round the loop. */
747                         descriptor = NULL;
748                         bufs = 0;
749                 }
750         }
751
752         /* Done it all: now write the commit record asynchronously. */
753
754         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
755                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
756                 err = journal_submit_commit_record(journal, commit_transaction,
757                                                  &cbh, crc32_sum);
758                 if (err)
759                         __jbd2_journal_abort_hard(journal);
760
761                 spin_lock(&journal->j_list_lock);
762                 err = journal_wait_on_locked_list(journal,
763                                                 commit_transaction);
764                 spin_unlock(&journal->j_list_lock);
765                 if (err)
766                         __jbd2_journal_abort_hard(journal);
767         }
768
769         /* Lo and behold: we have just managed to send a transaction to
770            the log.  Before we can commit it, wait for the IO so far to
771            complete.  Control buffers being written are on the
772            transaction's t_log_list queue, and metadata buffers are on
773            the t_iobuf_list queue.
774
775            Wait for the buffers in reverse order.  That way we are
776            less likely to be woken up until all IOs have completed, and
777            so we incur less scheduling load.
778         */
779
780         jbd_debug(3, "JBD: commit phase 4\n");
781
782         /*
783          * akpm: these are BJ_IO, and j_list_lock is not needed.
784          * See __journal_try_to_free_buffer.
785          */
786 wait_for_iobuf:
787         while (commit_transaction->t_iobuf_list != NULL) {
788                 struct buffer_head *bh;
789
790                 jh = commit_transaction->t_iobuf_list->b_tprev;
791                 bh = jh2bh(jh);
792                 if (buffer_locked(bh)) {
793                         wait_on_buffer(bh);
794                         goto wait_for_iobuf;
795                 }
796                 if (cond_resched())
797                         goto wait_for_iobuf;
798
799                 if (unlikely(!buffer_uptodate(bh)))
800                         err = -EIO;
801
802                 clear_buffer_jwrite(bh);
803
804                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
805                 jbd2_journal_unfile_buffer(journal, jh);
806
807                 /*
808                  * ->t_iobuf_list should contain only dummy buffer_heads
809                  * which were created by jbd2_journal_write_metadata_buffer().
810                  */
811                 BUFFER_TRACE(bh, "dumping temporary bh");
812                 jbd2_journal_put_journal_head(jh);
813                 __brelse(bh);
814                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
815                 free_buffer_head(bh);
816
817                 /* We also have to unlock and free the corresponding
818                    shadowed buffer */
819                 jh = commit_transaction->t_shadow_list->b_tprev;
820                 bh = jh2bh(jh);
821                 clear_bit(BH_JWrite, &bh->b_state);
822                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
823
824                 /* The metadata is now released for reuse, but we need
825                    to remember it against this transaction so that when
826                    we finally commit, we can do any checkpointing
827                    required. */
828                 JBUFFER_TRACE(jh, "file as BJ_Forget");
829                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
830                 /* Wake up any transactions which were waiting for this
831                    IO to complete */
832                 wake_up_bit(&bh->b_state, BH_Unshadow);
833                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
834                 __brelse(bh);
835         }
836
837         J_ASSERT (commit_transaction->t_shadow_list == NULL);
838
839         jbd_debug(3, "JBD: commit phase 5\n");
840
841         /* Here we wait for the revoke record and descriptor record buffers */
842  wait_for_ctlbuf:
843         while (commit_transaction->t_log_list != NULL) {
844                 struct buffer_head *bh;
845
846                 jh = commit_transaction->t_log_list->b_tprev;
847                 bh = jh2bh(jh);
848                 if (buffer_locked(bh)) {
849                         wait_on_buffer(bh);
850                         goto wait_for_ctlbuf;
851                 }
852                 if (cond_resched())
853                         goto wait_for_ctlbuf;
854
855                 if (unlikely(!buffer_uptodate(bh)))
856                         err = -EIO;
857
858                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
859                 clear_buffer_jwrite(bh);
860                 jbd2_journal_unfile_buffer(journal, jh);
861                 jbd2_journal_put_journal_head(jh);
862                 __brelse(bh);           /* One for getblk */
863                 /* AKPM: bforget here */
864         }
865
866         jbd_debug(3, "JBD: commit phase 6\n");
867
868         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
869                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
870                 err = journal_submit_commit_record(journal, commit_transaction,
871                                                 &cbh, crc32_sum);
872                 if (err)
873                         __jbd2_journal_abort_hard(journal);
874         }
875         err = journal_wait_on_commit_record(cbh);
876
877         if (err)
878                 jbd2_journal_abort(journal, err);
879
880         /* End of a transaction!  Finally, we can do checkpoint
881            processing: any buffers committed as a result of this
882            transaction can be removed from any checkpoint list it was on
883            before. */
884
885         jbd_debug(3, "JBD: commit phase 7\n");
886
887         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
888         J_ASSERT(commit_transaction->t_buffers == NULL);
889         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
890         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
891         J_ASSERT(commit_transaction->t_shadow_list == NULL);
892         J_ASSERT(commit_transaction->t_log_list == NULL);
893
894 restart_loop:
895         /*
896          * As there are other places (journal_unmap_buffer()) adding buffers
897          * to this list we have to be careful and hold the j_list_lock.
898          */
899         spin_lock(&journal->j_list_lock);
900         while (commit_transaction->t_forget) {
901                 transaction_t *cp_transaction;
902                 struct buffer_head *bh;
903
904                 jh = commit_transaction->t_forget;
905                 spin_unlock(&journal->j_list_lock);
906                 bh = jh2bh(jh);
907                 jbd_lock_bh_state(bh);
908                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
909                         jh->b_transaction == journal->j_running_transaction);
910
911                 /*
912                  * If there is undo-protected committed data against
913                  * this buffer, then we can remove it now.  If it is a
914                  * buffer needing such protection, the old frozen_data
915                  * field now points to a committed version of the
916                  * buffer, so rotate that field to the new committed
917                  * data.
918                  *
919                  * Otherwise, we can just throw away the frozen data now.
920                  */
921                 if (jh->b_committed_data) {
922                         jbd2_free(jh->b_committed_data, bh->b_size);
923                         jh->b_committed_data = NULL;
924                         if (jh->b_frozen_data) {
925                                 jh->b_committed_data = jh->b_frozen_data;
926                                 jh->b_frozen_data = NULL;
927                         }
928                 } else if (jh->b_frozen_data) {
929                         jbd2_free(jh->b_frozen_data, bh->b_size);
930                         jh->b_frozen_data = NULL;
931                 }
932
933                 spin_lock(&journal->j_list_lock);
934                 cp_transaction = jh->b_cp_transaction;
935                 if (cp_transaction) {
936                         JBUFFER_TRACE(jh, "remove from old cp transaction");
937                         cp_transaction->t_chp_stats.cs_dropped++;
938                         __jbd2_journal_remove_checkpoint(jh);
939                 }
940
941                 /* Only re-checkpoint the buffer_head if it is marked
942                  * dirty.  If the buffer was added to the BJ_Forget list
943                  * by jbd2_journal_forget, it may no longer be dirty and
944                  * there's no point in keeping a checkpoint record for
945                  * it. */
946
947                 /* A buffer which has been freed while still being
948                  * journaled by a previous transaction may end up still
949                  * being dirty here, but we want to avoid writing back
950                  * that buffer in the future now that the last use has
951                  * been committed.  That's not only a performance gain,
952                  * it also stops aliasing problems if the buffer is left
953                  * behind for writeback and gets reallocated for another
954                  * use in a different page. */
955                 if (buffer_freed(bh)) {
956                         clear_buffer_freed(bh);
957                         clear_buffer_jbddirty(bh);
958                 }
959
960                 if (buffer_jbddirty(bh)) {
961                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
962                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
963                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
964                         __jbd2_journal_refile_buffer(jh);
965                         jbd_unlock_bh_state(bh);
966                 } else {
967                         J_ASSERT_BH(bh, !buffer_dirty(bh));
968                         /* The buffer on BJ_Forget list and not jbddirty means
969                          * it has been freed by this transaction and hence it
970                          * could not have been reallocated until this
971                          * transaction has committed. *BUT* it could be
972                          * reallocated once we have written all the data to
973                          * disk and before we process the buffer on BJ_Forget
974                          * list. */
975                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
976                         __jbd2_journal_refile_buffer(jh);
977                         if (!jh->b_transaction) {
978                                 jbd_unlock_bh_state(bh);
979                                  /* needs a brelse */
980                                 jbd2_journal_remove_journal_head(bh);
981                                 release_buffer_page(bh);
982                         } else
983                                 jbd_unlock_bh_state(bh);
984                 }
985                 cond_resched_lock(&journal->j_list_lock);
986         }
987         spin_unlock(&journal->j_list_lock);
988         /*
989          * This is a bit sleazy.  We use j_list_lock to protect transition
990          * of a transaction into T_FINISHED state and calling
991          * __jbd2_journal_drop_transaction(). Otherwise we could race with
992          * other checkpointing code processing the transaction...
993          */
994         spin_lock(&journal->j_state_lock);
995         spin_lock(&journal->j_list_lock);
996         /*
997          * Now recheck if some buffers did not get attached to the transaction
998          * while the lock was dropped...
999          */
1000         if (commit_transaction->t_forget) {
1001                 spin_unlock(&journal->j_list_lock);
1002                 spin_unlock(&journal->j_state_lock);
1003                 goto restart_loop;
1004         }
1005
1006         /* Done with this transaction! */
1007
1008         jbd_debug(3, "JBD: commit phase 8\n");
1009
1010         J_ASSERT(commit_transaction->t_state == T_COMMIT);
1011
1012         commit_transaction->t_start = jiffies;
1013         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1014                                                 commit_transaction->t_start);
1015
1016         /*
1017          * File the transaction for history
1018          */
1019         stats.ts_type = JBD2_STATS_RUN;
1020         stats.ts_tid = commit_transaction->t_tid;
1021         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1022         spin_lock(&journal->j_history_lock);
1023         memcpy(journal->j_history + journal->j_history_cur, &stats,
1024                         sizeof(stats));
1025         if (++journal->j_history_cur == journal->j_history_max)
1026                 journal->j_history_cur = 0;
1027
1028         /*
1029          * Calculate overall stats
1030          */
1031         journal->j_stats.ts_tid++;
1032         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1033         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1034         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1035         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1036         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1037         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1038         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1039         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1040         spin_unlock(&journal->j_history_lock);
1041
1042         commit_transaction->t_state = T_FINISHED;
1043         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1044         journal->j_commit_sequence = commit_transaction->t_tid;
1045         journal->j_committing_transaction = NULL;
1046         spin_unlock(&journal->j_state_lock);
1047
1048         if (commit_transaction->t_checkpoint_list == NULL &&
1049             commit_transaction->t_checkpoint_io_list == NULL) {
1050                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1051         } else {
1052                 if (journal->j_checkpoint_transactions == NULL) {
1053                         journal->j_checkpoint_transactions = commit_transaction;
1054                         commit_transaction->t_cpnext = commit_transaction;
1055                         commit_transaction->t_cpprev = commit_transaction;
1056                 } else {
1057                         commit_transaction->t_cpnext =
1058                                 journal->j_checkpoint_transactions;
1059                         commit_transaction->t_cpprev =
1060                                 commit_transaction->t_cpnext->t_cpprev;
1061                         commit_transaction->t_cpnext->t_cpprev =
1062                                 commit_transaction;
1063                         commit_transaction->t_cpprev->t_cpnext =
1064                                 commit_transaction;
1065                 }
1066         }
1067         spin_unlock(&journal->j_list_lock);
1068
1069         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1070                   journal->j_commit_sequence, journal->j_tail_sequence);
1071
1072         wake_up(&journal->j_wait_done_commit);
1073 }