eedd201374a8d09de63580ea122baa2cfd44ea2b
[linux-2.6.git] / fs / jbd / commit.c
1 /*
2  * linux/fs/jbd/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd.h>
19 #include <linux/errno.h>
20 #include <linux/mm.h>
21 #include <linux/pagemap.h>
22 #include <linux/bio.h>
23 #include <linux/blkdev.h>
24 #include <trace/events/jbd.h>
25
/*
 * Default IO end handler for temporary BJ_IO buffer_heads: record the
 * outcome of the write in the buffer's uptodate flag and drop the
 * buffer lock so that the commit code waiting on it can proceed.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
38
39 /*
40  * When an ext3-ordered file is truncated, it is possible that many pages are
41  * not successfully freed, because they are attached to a committing transaction.
42  * After the transaction commits, these pages are left on the LRU, with no
43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
45  * the numbers in /proc/meminfo look odd.
46  *
47  * So here, we have a buffer which has just come off the forget list.  Look to
48  * see if we can strip all buffers from the backing page.
49  *
50  * Called under journal->j_list_lock.  The caller provided us with a ref
51  * against the buffer, and we drop that here.
52  */
53 static void release_buffer_page(struct buffer_head *bh)
54 {
55         struct page *page;
56
57         if (buffer_dirty(bh))
58                 goto nope;
59         if (atomic_read(&bh->b_count) != 1)
60                 goto nope;
61         page = bh->b_page;
62         if (!page)
63                 goto nope;
64         if (page->mapping)
65                 goto nope;
66
67         /* OK, it's a truncated page */
68         if (!trylock_page(page))
69                 goto nope;
70
71         page_cache_get(page);
72         __brelse(bh);
73         try_to_free_buffers(page);
74         unlock_page(page);
75         page_cache_release(page);
76         return;
77
78 nope:
79         __brelse(bh);
80 }
81
/*
 * Drop one reference to a data buffer.  If the buffer was marked
 * 'BH_Freed' (truncated while we held it), clear the flag and try to
 * release the page it belongs to as well.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (!buffer_freed(bh)) {
		put_bh(bh);
		return;
	}
	clear_buffer_freed(bh);
	release_buffer_page(bh);
}
94
95 /*
96  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
97  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
98  * return 0.  j_list_lock is dropped in this case.
99  */
100 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
101 {
102         if (!jbd_trylock_bh_state(bh)) {
103                 spin_unlock(&journal->j_list_lock);
104                 schedule();
105                 return 0;
106         }
107         return 1;
108 }
109
/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	/* An already-aborted journal writes nothing more; report success
	 * here since the abort has been handled elsewhere. */
	if (is_journal_aborted(journal))
		return 0;

	/* Get a fresh journal block to hold the commit record. */
	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the commit block header: magic, block type, and the tid
	 * of the transaction being committed (on-disk fields are
	 * big-endian). */
	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

	/* With barriers enabled, write the commit block with flush+FUA so
	 * it hits stable storage only after all earlier journal IO;
	 * otherwise a plain synchronous write suffices. */
	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	/* Only a hard IO error on the commit block forces an abort. */
	return (ret == -EIO);
}
152
153 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
154                                    int write_op)
155 {
156         int i;
157
158         for (i = 0; i < bufs; i++) {
159                 wbuf[i]->b_end_io = end_buffer_write_sync;
160                 /* We use-up our safety reference in submit_bh() */
161                 submit_bh(write_op, wbuf[i]);
162         }
163 }
164
/*
 *  Submit all the data buffers to disk
 *
 *  Returns 0 on success, or -EIO if any buffer on the sync-data list
 *  turned out not to be uptodate after write-out.
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			/* inverted_lock() dropped j_list_lock for us, so we
			 * may now take bh_state blocking (correct order),
			 * then retake j_list_lock. */
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		/* Case 1: buffer was dirty and we locked it — queue it for
		 * submission and park it on BJ_Locked so the wait phase can
		 * find it.  The safety reference travels with wbuf[]. */
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		/* Case 2: someone else is writing it — just move it to
		 * BJ_Locked so we will wait for that IO later. */
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		/* Case 3: writeout already complete — unfile the buffer and
		 * drop all our references. */
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			journal_remove_journal_head(bh);
			/* One for our safety reference, other for
			 * journal_remove_journal_head() */
			put_bh(bh);
			release_data_buffer(bh);
		}

		/* Be nice to the rest of the system: back off if we have
		 * been hogging the CPU or the list lock is contended. */
		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	/* Flush whatever is still batched in wbuf[]. */
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}
279
280 /*
281  * journal_commit_transaction
282  *
283  * The primary function for committing a transaction to the log.  This
284  * function is called by the journal thread to begin a complete commit.
285  */
286 void journal_commit_transaction(journal_t *journal)
287 {
288         transaction_t *commit_transaction;
289         struct journal_head *jh, *new_jh, *descriptor;
290         struct buffer_head **wbuf = journal->j_wbuf;
291         int bufs;
292         int flags;
293         int err;
294         unsigned int blocknr;
295         ktime_t start_time;
296         u64 commit_time;
297         char *tagp = NULL;
298         journal_header_t *header;
299         journal_block_tag_t *tag = NULL;
300         int space_left = 0;
301         int first_tag = 0;
302         int tag_flag;
303         int i;
304         struct blk_plug plug;
305
306         /*
307          * First job: lock down the current transaction and wait for
308          * all outstanding updates to complete.
309          */
310
311         /* Do we need to erase the effects of a prior journal_flush? */
312         if (journal->j_flags & JFS_FLUSHED) {
313                 jbd_debug(3, "super block updated\n");
314                 journal_update_superblock(journal, 1);
315         } else {
316                 jbd_debug(3, "superblock not updated\n");
317         }
318
319         J_ASSERT(journal->j_running_transaction != NULL);
320         J_ASSERT(journal->j_committing_transaction == NULL);
321
322         commit_transaction = journal->j_running_transaction;
323         J_ASSERT(commit_transaction->t_state == T_RUNNING);
324
325         trace_jbd_start_commit(journal, commit_transaction);
326         jbd_debug(1, "JBD: starting commit of transaction %d\n",
327                         commit_transaction->t_tid);
328
329         spin_lock(&journal->j_state_lock);
330         commit_transaction->t_state = T_LOCKED;
331
332         trace_jbd_commit_locking(journal, commit_transaction);
333         spin_lock(&commit_transaction->t_handle_lock);
334         while (commit_transaction->t_updates) {
335                 DEFINE_WAIT(wait);
336
337                 prepare_to_wait(&journal->j_wait_updates, &wait,
338                                         TASK_UNINTERRUPTIBLE);
339                 if (commit_transaction->t_updates) {
340                         spin_unlock(&commit_transaction->t_handle_lock);
341                         spin_unlock(&journal->j_state_lock);
342                         schedule();
343                         spin_lock(&journal->j_state_lock);
344                         spin_lock(&commit_transaction->t_handle_lock);
345                 }
346                 finish_wait(&journal->j_wait_updates, &wait);
347         }
348         spin_unlock(&commit_transaction->t_handle_lock);
349
350         J_ASSERT (commit_transaction->t_outstanding_credits <=
351                         journal->j_max_transaction_buffers);
352
353         /*
354          * First thing we are allowed to do is to discard any remaining
355          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
356          * that there are no such buffers: if a large filesystem
357          * operation like a truncate needs to split itself over multiple
358          * transactions, then it may try to do a journal_restart() while
359          * there are still BJ_Reserved buffers outstanding.  These must
360          * be released cleanly from the current transaction.
361          *
362          * In this case, the filesystem must still reserve write access
363          * again before modifying the buffer in the new transaction, but
364          * we do not require it to remember exactly which old buffers it
365          * has reserved.  This is consistent with the existing behaviour
366          * that multiple journal_get_write_access() calls to the same
367          * buffer are perfectly permissible.
368          */
369         while (commit_transaction->t_reserved_list) {
370                 jh = commit_transaction->t_reserved_list;
371                 JBUFFER_TRACE(jh, "reserved, unused: refile");
372                 /*
373                  * A journal_get_undo_access()+journal_release_buffer() may
374                  * leave undo-committed data.
375                  */
376                 if (jh->b_committed_data) {
377                         struct buffer_head *bh = jh2bh(jh);
378
379                         jbd_lock_bh_state(bh);
380                         jbd_free(jh->b_committed_data, bh->b_size);
381                         jh->b_committed_data = NULL;
382                         jbd_unlock_bh_state(bh);
383                 }
384                 journal_refile_buffer(journal, jh);
385         }
386
387         /*
388          * Now try to drop any written-back buffers from the journal's
389          * checkpoint lists.  We do this *before* commit because it potentially
390          * frees some memory
391          */
392         spin_lock(&journal->j_list_lock);
393         __journal_clean_checkpoint_list(journal);
394         spin_unlock(&journal->j_list_lock);
395
396         jbd_debug (3, "JBD: commit phase 1\n");
397
398         /*
399          * Switch to a new revoke table.
400          */
401         journal_switch_revoke_table(journal);
402
403         trace_jbd_commit_flushing(journal, commit_transaction);
404         commit_transaction->t_state = T_FLUSH;
405         journal->j_committing_transaction = commit_transaction;
406         journal->j_running_transaction = NULL;
407         start_time = ktime_get();
408         commit_transaction->t_log_start = journal->j_head;
409         wake_up(&journal->j_wait_transaction_locked);
410         spin_unlock(&journal->j_state_lock);
411
412         jbd_debug (3, "JBD: commit phase 2\n");
413
414         /*
415          * Now start flushing things to disk, in the order they appear
416          * on the transaction lists.  Data blocks go first.
417          */
418         blk_start_plug(&plug);
419         err = journal_submit_data_buffers(journal, commit_transaction,
420                                           WRITE_SYNC);
421         blk_finish_plug(&plug);
422
423         /*
424          * Wait for all previously submitted IO to complete.
425          */
426         spin_lock(&journal->j_list_lock);
427         while (commit_transaction->t_locked_list) {
428                 struct buffer_head *bh;
429
430                 jh = commit_transaction->t_locked_list->b_tprev;
431                 bh = jh2bh(jh);
432                 get_bh(bh);
433                 if (buffer_locked(bh)) {
434                         spin_unlock(&journal->j_list_lock);
435                         wait_on_buffer(bh);
436                         spin_lock(&journal->j_list_lock);
437                 }
438                 if (unlikely(!buffer_uptodate(bh))) {
439                         if (!trylock_page(bh->b_page)) {
440                                 spin_unlock(&journal->j_list_lock);
441                                 lock_page(bh->b_page);
442                                 spin_lock(&journal->j_list_lock);
443                         }
444                         if (bh->b_page->mapping)
445                                 set_bit(AS_EIO, &bh->b_page->mapping->flags);
446
447                         unlock_page(bh->b_page);
448                         SetPageError(bh->b_page);
449                         err = -EIO;
450                 }
451                 if (!inverted_lock(journal, bh)) {
452                         put_bh(bh);
453                         spin_lock(&journal->j_list_lock);
454                         continue;
455                 }
456                 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
457                     jh->b_transaction == commit_transaction &&
458                     jh->b_jlist == BJ_Locked) {
459                         __journal_unfile_buffer(jh);
460                         jbd_unlock_bh_state(bh);
461                         journal_remove_journal_head(bh);
462                         put_bh(bh);
463                 } else {
464                         jbd_unlock_bh_state(bh);
465                 }
466                 release_data_buffer(bh);
467                 cond_resched_lock(&journal->j_list_lock);
468         }
469         spin_unlock(&journal->j_list_lock);
470
471         if (err) {
472                 char b[BDEVNAME_SIZE];
473
474                 printk(KERN_WARNING
475                         "JBD: Detected IO errors while flushing file data "
476                         "on %s\n", bdevname(journal->j_fs_dev, b));
477                 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
478                         journal_abort(journal, err);
479                 err = 0;
480         }
481
482         blk_start_plug(&plug);
483
484         journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
485
486         /*
487          * If we found any dirty or locked buffers, then we should have
488          * looped back up to the write_out_data label.  If there weren't
489          * any then journal_clean_data_list should have wiped the list
490          * clean by now, so check that it is in fact empty.
491          */
492         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
493
494         jbd_debug (3, "JBD: commit phase 3\n");
495
496         /*
497          * Way to go: we have now written out all of the data for a
498          * transaction!  Now comes the tricky part: we need to write out
499          * metadata.  Loop over the transaction's entire buffer list:
500          */
501         spin_lock(&journal->j_state_lock);
502         commit_transaction->t_state = T_COMMIT;
503         spin_unlock(&journal->j_state_lock);
504
505         trace_jbd_commit_logging(journal, commit_transaction);
506         J_ASSERT(commit_transaction->t_nr_buffers <=
507                  commit_transaction->t_outstanding_credits);
508
509         descriptor = NULL;
510         bufs = 0;
511         while (commit_transaction->t_buffers) {
512
513                 /* Find the next buffer to be journaled... */
514
515                 jh = commit_transaction->t_buffers;
516
517                 /* If we're in abort mode, we just un-journal the buffer and
518                    release it. */
519
520                 if (is_journal_aborted(journal)) {
521                         clear_buffer_jbddirty(jh2bh(jh));
522                         JBUFFER_TRACE(jh, "journal is aborting: refile");
523                         journal_refile_buffer(journal, jh);
524                         /* If that was the last one, we need to clean up
525                          * any descriptor buffers which may have been
526                          * already allocated, even if we are now
527                          * aborting. */
528                         if (!commit_transaction->t_buffers)
529                                 goto start_journal_io;
530                         continue;
531                 }
532
533                 /* Make sure we have a descriptor block in which to
534                    record the metadata buffer. */
535
536                 if (!descriptor) {
537                         struct buffer_head *bh;
538
539                         J_ASSERT (bufs == 0);
540
541                         jbd_debug(4, "JBD: get descriptor\n");
542
543                         descriptor = journal_get_descriptor_buffer(journal);
544                         if (!descriptor) {
545                                 journal_abort(journal, -EIO);
546                                 continue;
547                         }
548
549                         bh = jh2bh(descriptor);
550                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
551                                 (unsigned long long)bh->b_blocknr, bh->b_data);
552                         header = (journal_header_t *)&bh->b_data[0];
553                         header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
554                         header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
555                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
556
557                         tagp = &bh->b_data[sizeof(journal_header_t)];
558                         space_left = bh->b_size - sizeof(journal_header_t);
559                         first_tag = 1;
560                         set_buffer_jwrite(bh);
561                         set_buffer_dirty(bh);
562                         wbuf[bufs++] = bh;
563
564                         /* Record it so that we can wait for IO
565                            completion later */
566                         BUFFER_TRACE(bh, "ph3: file as descriptor");
567                         journal_file_buffer(descriptor, commit_transaction,
568                                         BJ_LogCtl);
569                 }
570
571                 /* Where is the buffer to be written? */
572
573                 err = journal_next_log_block(journal, &blocknr);
574                 /* If the block mapping failed, just abandon the buffer
575                    and repeat this loop: we'll fall into the
576                    refile-on-abort condition above. */
577                 if (err) {
578                         journal_abort(journal, err);
579                         continue;
580                 }
581
582                 /*
583                  * start_this_handle() uses t_outstanding_credits to determine
584                  * the free space in the log, but this counter is changed
585                  * by journal_next_log_block() also.
586                  */
587                 commit_transaction->t_outstanding_credits--;
588
589                 /* Bump b_count to prevent truncate from stumbling over
590                    the shadowed buffer!  @@@ This can go if we ever get
591                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
592                 get_bh(jh2bh(jh));
593
594                 /* Make a temporary IO buffer with which to write it out
595                    (this will requeue both the metadata buffer and the
596                    temporary IO buffer). new_bh goes on BJ_IO*/
597
598                 set_buffer_jwrite(jh2bh(jh));
599                 /*
600                  * akpm: journal_write_metadata_buffer() sets
601                  * new_bh->b_transaction to commit_transaction.
602                  * We need to clean this up before we release new_bh
603                  * (which is of type BJ_IO)
604                  */
605                 JBUFFER_TRACE(jh, "ph3: write metadata");
606                 flags = journal_write_metadata_buffer(commit_transaction,
607                                                       jh, &new_jh, blocknr);
608                 set_buffer_jwrite(jh2bh(new_jh));
609                 wbuf[bufs++] = jh2bh(new_jh);
610
611                 /* Record the new block's tag in the current descriptor
612                    buffer */
613
614                 tag_flag = 0;
615                 if (flags & 1)
616                         tag_flag |= JFS_FLAG_ESCAPE;
617                 if (!first_tag)
618                         tag_flag |= JFS_FLAG_SAME_UUID;
619
620                 tag = (journal_block_tag_t *) tagp;
621                 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
622                 tag->t_flags = cpu_to_be32(tag_flag);
623                 tagp += sizeof(journal_block_tag_t);
624                 space_left -= sizeof(journal_block_tag_t);
625
626                 if (first_tag) {
627                         memcpy (tagp, journal->j_uuid, 16);
628                         tagp += 16;
629                         space_left -= 16;
630                         first_tag = 0;
631                 }
632
633                 /* If there's no more to do, or if the descriptor is full,
634                    let the IO rip! */
635
636                 if (bufs == journal->j_wbufsize ||
637                     commit_transaction->t_buffers == NULL ||
638                     space_left < sizeof(journal_block_tag_t) + 16) {
639
640                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
641
642                         /* Write an end-of-descriptor marker before
643                            submitting the IOs.  "tag" still points to
644                            the last tag we set up. */
645
646                         tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
647
648 start_journal_io:
649                         for (i = 0; i < bufs; i++) {
650                                 struct buffer_head *bh = wbuf[i];
651                                 lock_buffer(bh);
652                                 clear_buffer_dirty(bh);
653                                 set_buffer_uptodate(bh);
654                                 bh->b_end_io = journal_end_buffer_io_sync;
655                                 submit_bh(WRITE_SYNC, bh);
656                         }
657                         cond_resched();
658
659                         /* Force a new descriptor to be generated next
660                            time round the loop. */
661                         descriptor = NULL;
662                         bufs = 0;
663                 }
664         }
665
666         blk_finish_plug(&plug);
667
668         /* Lo and behold: we have just managed to send a transaction to
669            the log.  Before we can commit it, wait for the IO so far to
670            complete.  Control buffers being written are on the
671            transaction's t_log_list queue, and metadata buffers are on
672            the t_iobuf_list queue.
673
674            Wait for the buffers in reverse order.  That way we are
675            less likely to be woken up until all IOs have completed, and
676            so we incur less scheduling load.
677         */
678
679         jbd_debug(3, "JBD: commit phase 4\n");
680
681         /*
682          * akpm: these are BJ_IO, and j_list_lock is not needed.
683          * See __journal_try_to_free_buffer.
684          */
685 wait_for_iobuf:
686         while (commit_transaction->t_iobuf_list != NULL) {
687                 struct buffer_head *bh;
688
689                 jh = commit_transaction->t_iobuf_list->b_tprev;
690                 bh = jh2bh(jh);
691                 if (buffer_locked(bh)) {
692                         wait_on_buffer(bh);
693                         goto wait_for_iobuf;
694                 }
695                 if (cond_resched())
696                         goto wait_for_iobuf;
697
698                 if (unlikely(!buffer_uptodate(bh)))
699                         err = -EIO;
700
701                 clear_buffer_jwrite(bh);
702
703                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
704                 journal_unfile_buffer(journal, jh);
705
706                 /*
707                  * ->t_iobuf_list should contain only dummy buffer_heads
708                  * which were created by journal_write_metadata_buffer().
709                  */
710                 BUFFER_TRACE(bh, "dumping temporary bh");
711                 journal_put_journal_head(jh);
712                 __brelse(bh);
713                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
714                 free_buffer_head(bh);
715
716                 /* We also have to unlock and free the corresponding
717                    shadowed buffer */
718                 jh = commit_transaction->t_shadow_list->b_tprev;
719                 bh = jh2bh(jh);
720                 clear_buffer_jwrite(bh);
721                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
722
723                 /* The metadata is now released for reuse, but we need
724                    to remember it against this transaction so that when
725                    we finally commit, we can do any checkpointing
726                    required. */
727                 JBUFFER_TRACE(jh, "file as BJ_Forget");
728                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
729                 /*
730                  * Wake up any transactions which were waiting for this
731                  * IO to complete. The barrier must be here so that changes
732                  * by journal_file_buffer() take effect before wake_up_bit()
733                  * does the waitqueue check.
734                  */
735                 smp_mb();
736                 wake_up_bit(&bh->b_state, BH_Unshadow);
737                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
738                 __brelse(bh);
739         }
740
741         J_ASSERT (commit_transaction->t_shadow_list == NULL);
742
743         jbd_debug(3, "JBD: commit phase 5\n");
744
745         /* Here we wait for the revoke record and descriptor record buffers */
746  wait_for_ctlbuf:
747         while (commit_transaction->t_log_list != NULL) {
748                 struct buffer_head *bh;
749
750                 jh = commit_transaction->t_log_list->b_tprev;
751                 bh = jh2bh(jh);
752                 if (buffer_locked(bh)) {
753                         wait_on_buffer(bh);
754                         goto wait_for_ctlbuf;
755                 }
756                 if (cond_resched())
757                         goto wait_for_ctlbuf;
758
759                 if (unlikely(!buffer_uptodate(bh)))
760                         err = -EIO;
761
762                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
763                 clear_buffer_jwrite(bh);
764                 journal_unfile_buffer(journal, jh);
765                 journal_put_journal_head(jh);
766                 __brelse(bh);           /* One for getblk */
767                 /* AKPM: bforget here */
768         }
769
770         if (err)
771                 journal_abort(journal, err);
772
773         jbd_debug(3, "JBD: commit phase 6\n");
774
775         /* All metadata is written, now write commit record and do cleanup */
776         spin_lock(&journal->j_state_lock);
777         J_ASSERT(commit_transaction->t_state == T_COMMIT);
778         commit_transaction->t_state = T_COMMIT_RECORD;
779         spin_unlock(&journal->j_state_lock);
780
781         if (journal_write_commit_record(journal, commit_transaction))
782                 err = -EIO;
783
784         if (err)
785                 journal_abort(journal, err);
786
787         /* End of a transaction!  Finally, we can do checkpoint
788            processing: any buffers committed as a result of this
789            transaction can be removed from any checkpoint list they were on
790            before. */
791
792         jbd_debug(3, "JBD: commit phase 7\n");
793
794         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
795         J_ASSERT(commit_transaction->t_buffers == NULL);
796         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
797         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
798         J_ASSERT(commit_transaction->t_shadow_list == NULL);
799         J_ASSERT(commit_transaction->t_log_list == NULL);
800
801 restart_loop:
802         /*
803          * As there are other places (journal_unmap_buffer()) adding buffers
804          * to this list we have to be careful and hold the j_list_lock.
805          */
806         spin_lock(&journal->j_list_lock);
807         while (commit_transaction->t_forget) {
808                 transaction_t *cp_transaction;
809                 struct buffer_head *bh;
810
811                 jh = commit_transaction->t_forget;
812                 spin_unlock(&journal->j_list_lock);
813                 bh = jh2bh(jh);
814                 jbd_lock_bh_state(bh);
815                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
816                         jh->b_transaction == journal->j_running_transaction);
817
818                 /*
819                  * If there is undo-protected committed data against
820                  * this buffer, then we can remove it now.  If it is a
821                  * buffer needing such protection, the old frozen_data
822                  * field now points to a committed version of the
823                  * buffer, so rotate that field to the new committed
824                  * data.
825                  *
826                  * Otherwise, we can just throw away the frozen data now.
827                  */
828                 if (jh->b_committed_data) {
829                         jbd_free(jh->b_committed_data, bh->b_size);
830                         jh->b_committed_data = NULL;
831                         if (jh->b_frozen_data) {
832                                 jh->b_committed_data = jh->b_frozen_data;
833                                 jh->b_frozen_data = NULL;
834                         }
835                 } else if (jh->b_frozen_data) {
836                         jbd_free(jh->b_frozen_data, bh->b_size);
837                         jh->b_frozen_data = NULL;
838                 }
839
840                 spin_lock(&journal->j_list_lock);
841                 cp_transaction = jh->b_cp_transaction;
842                 if (cp_transaction) {
843                         JBUFFER_TRACE(jh, "remove from old cp transaction");
844                         __journal_remove_checkpoint(jh);
845                 }
846
847                 /* Only re-checkpoint the buffer_head if it is marked
848                  * dirty.  If the buffer was added to the BJ_Forget list
849                  * by journal_forget, it may no longer be dirty and
850                  * there's no point in keeping a checkpoint record for
851                  * it. */
852
853                 /* A buffer which has been freed while still being
854                  * journaled by a previous transaction may end up still
855                  * being dirty here, but we want to avoid writing back
856                  * that buffer in the future after the "add to orphan"
857                  * operation has been committed.  That's not only a performance
858                  * gain, it also stops aliasing problems if the buffer is
859                  * left behind for writeback and gets reallocated for another
860                  * use in a different page. */
861                 if (buffer_freed(bh) && !jh->b_next_transaction) {
862                         clear_buffer_freed(bh);
863                         clear_buffer_jbddirty(bh);
864                 }
865
866                 if (buffer_jbddirty(bh)) {
867                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
868                         __journal_insert_checkpoint(jh, commit_transaction);
869                         if (is_journal_aborted(journal))
870                                 clear_buffer_jbddirty(bh);
871                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
872                         __journal_refile_buffer(jh);
873                         jbd_unlock_bh_state(bh);
874                 } else {
875                         J_ASSERT_BH(bh, !buffer_dirty(bh));
876                         /* The buffer on BJ_Forget list and not jbddirty means
877                          * it has been freed by this transaction and hence it
878                          * could not have been reallocated until this
879                          * transaction has committed. *BUT* it could be
880                          * reallocated once we have written all the data to
881                          * disk and before we process the buffer on BJ_Forget
882                          * list. */
883                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
884                         __journal_refile_buffer(jh);
885                         if (!jh->b_transaction) {
886                                 jbd_unlock_bh_state(bh);
887                                  /* needs a brelse */
888                                 journal_remove_journal_head(bh);
889                                 release_buffer_page(bh);
890                         } else
891                                 jbd_unlock_bh_state(bh);
892                 }
893                 cond_resched_lock(&journal->j_list_lock);
894         }
895         spin_unlock(&journal->j_list_lock);
896         /*
897          * This is a bit sleazy.  We use j_list_lock to protect transition
898          * of a transaction into T_FINISHED state and calling
899          * __journal_drop_transaction(). Otherwise we could race with
900          * other checkpointing code processing the transaction...
901          */
902         spin_lock(&journal->j_state_lock);
903         spin_lock(&journal->j_list_lock);
904         /*
905          * Now recheck if some buffers did not get attached to the transaction
906          * while the lock was dropped...
907          */
908         if (commit_transaction->t_forget) {
909                 spin_unlock(&journal->j_list_lock);
910                 spin_unlock(&journal->j_state_lock);
911                 goto restart_loop;
912         }
913
914         /* Done with this transaction! */
915
916         jbd_debug(3, "JBD: commit phase 8\n");
917
918         J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
919
920         commit_transaction->t_state = T_FINISHED;
921         J_ASSERT(commit_transaction == journal->j_committing_transaction);
922         journal->j_commit_sequence = commit_transaction->t_tid;
923         journal->j_committing_transaction = NULL;
924         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
925
926         /*
927          * weight the commit time higher than the average time so we don't
928          * react too strongly to vast changes in commit time
929          */
930         if (likely(journal->j_average_commit_time))
931                 journal->j_average_commit_time = (commit_time*3 +
932                                 journal->j_average_commit_time) / 4;
933         else
934                 journal->j_average_commit_time = commit_time;
935
936         spin_unlock(&journal->j_state_lock);
937
938         if (commit_transaction->t_checkpoint_list == NULL &&
939             commit_transaction->t_checkpoint_io_list == NULL) {
940                 __journal_drop_transaction(journal, commit_transaction);
941         } else {
942                 if (journal->j_checkpoint_transactions == NULL) {
943                         journal->j_checkpoint_transactions = commit_transaction;
944                         commit_transaction->t_cpnext = commit_transaction;
945                         commit_transaction->t_cpprev = commit_transaction;
946                 } else {
947                         commit_transaction->t_cpnext =
948                                 journal->j_checkpoint_transactions;
949                         commit_transaction->t_cpprev =
950                                 commit_transaction->t_cpnext->t_cpprev;
951                         commit_transaction->t_cpnext->t_cpprev =
952                                 commit_transaction;
953                         commit_transaction->t_cpprev->t_cpnext =
954                                 commit_transaction;
955                 }
956         }
957         spin_unlock(&journal->j_list_lock);
958
959         trace_jbd_end_commit(journal, commit_transaction);
960         jbd_debug(1, "JBD: commit %d complete, head %d\n",
961                   journal->j_commit_sequence, journal->j_tail_sequence);
962
963         wake_up(&journal->j_wait_done_commit);
964 }