jbd2: jbd2 stats through procfs
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24
/*
 * IO completion handler used for the temporary BJ_IO buffer_heads.
 *
 * Mirrors the IO result into the buffer's uptodate bit (set when @uptodate
 * is non-zero, cleared otherwise) and then unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);
        unlock_buffer(bh);
}
37
38 /*
39  * When an ext3-ordered file is truncated, it is possible that many pages are
40  * not sucessfully freed, because they are attached to a committing transaction.
41  * After the transaction commits, these pages are left on the LRU, with no
42  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
43  * by the VM, but their apparent absence upsets the VM accounting, and it makes
44  * the numbers in /proc/meminfo look odd.
45  *
46  * So here, we have a buffer which has just come off the forget list.  Look to
47  * see if we can strip all buffers from the backing page.
48  *
49  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
50  * caller provided us with a ref against the buffer, and we drop that here.
51  */
52 static void release_buffer_page(struct buffer_head *bh)
53 {
54         struct page *page;
55
56         if (buffer_dirty(bh))
57                 goto nope;
58         if (atomic_read(&bh->b_count) != 1)
59                 goto nope;
60         page = bh->b_page;
61         if (!page)
62                 goto nope;
63         if (page->mapping)
64                 goto nope;
65
66         /* OK, it's a truncated page */
67         if (TestSetPageLocked(page))
68                 goto nope;
69
70         page_cache_get(page);
71         __brelse(bh);
72         try_to_free_buffers(page);
73         unlock_page(page);
74         page_cache_release(page);
75         return;
76
77 nope:
78         __brelse(bh);
79 }
80
81 /*
82  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
84  * return 0.  j_list_lock is dropped in this case.
85  */
86 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87 {
88         if (!jbd_trylock_bh_state(bh)) {
89                 spin_unlock(&journal->j_list_lock);
90                 schedule();
91                 return 0;
92         }
93         return 1;
94 }
95
96 /* Done it all: now write the commit record.  We should have
97  * cleaned up our previous buffers by now, so if we are in abort
98  * mode we can now just skip the rest of the journal write
99  * entirely.
100  *
101  * Returns 1 if the journal needs to be aborted or 0 on success
102  */
103 static int journal_write_commit_record(journal_t *journal,
104                                         transaction_t *commit_transaction)
105 {
106         struct journal_head *descriptor;
107         struct buffer_head *bh;
108         int i, ret;
109         int barrier_done = 0;
110
111         if (is_journal_aborted(journal))
112                 return 0;
113
114         descriptor = jbd2_journal_get_descriptor_buffer(journal);
115         if (!descriptor)
116                 return 1;
117
118         bh = jh2bh(descriptor);
119
120         /* AKPM: buglet - add `i' to tmp! */
121         for (i = 0; i < bh->b_size; i += 512) {
122                 journal_header_t *tmp = (journal_header_t*)bh->b_data;
123                 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124                 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
125                 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126         }
127
128         JBUFFER_TRACE(descriptor, "write commit block");
129         set_buffer_dirty(bh);
130         if (journal->j_flags & JBD2_BARRIER) {
131                 set_buffer_ordered(bh);
132                 barrier_done = 1;
133         }
134         ret = sync_dirty_buffer(bh);
135         /* is it possible for another commit to fail at roughly
136          * the same time as this one?  If so, we don't want to
137          * trust the barrier flag in the super, but instead want
138          * to remember if we sent a barrier request
139          */
140         if (ret == -EOPNOTSUPP && barrier_done) {
141                 char b[BDEVNAME_SIZE];
142
143                 printk(KERN_WARNING
144                         "JBD: barrier-based sync failed on %s - "
145                         "disabling barriers\n",
146                         bdevname(journal->j_dev, b));
147                 spin_lock(&journal->j_state_lock);
148                 journal->j_flags &= ~JBD2_BARRIER;
149                 spin_unlock(&journal->j_state_lock);
150
151                 /* And try again, without the barrier */
152                 clear_buffer_ordered(bh);
153                 set_buffer_uptodate(bh);
154                 set_buffer_dirty(bh);
155                 ret = sync_dirty_buffer(bh);
156         }
157         put_bh(bh);             /* One for getblk() */
158         jbd2_journal_put_journal_head(descriptor);
159
160         return (ret == -EIO);
161 }
162
163 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164 {
165         int i;
166
167         for (i = 0; i < bufs; i++) {
168                 wbuf[i]->b_end_io = end_buffer_write_sync;
169                 /* We use-up our safety reference in submit_bh() */
170                 submit_bh(WRITE, wbuf[i]);
171         }
172 }
173
174 /*
175  *  Submit all the data buffers to disk
176  */
177 static void journal_submit_data_buffers(journal_t *journal,
178                                 transaction_t *commit_transaction)
179 {
180         struct journal_head *jh;
181         struct buffer_head *bh;
182         int locked;
183         int bufs = 0;
184         struct buffer_head **wbuf = journal->j_wbuf;
185
186         /*
187          * Whenever we unlock the journal and sleep, things can get added
188          * onto ->t_sync_datalist, so we have to keep looping back to
189          * write_out_data until we *know* that the list is empty.
190          *
191          * Cleanup any flushed data buffers from the data list.  Even in
192          * abort mode, we want to flush this out as soon as possible.
193          */
194 write_out_data:
195         cond_resched();
196         spin_lock(&journal->j_list_lock);
197
198         while (commit_transaction->t_sync_datalist) {
199                 jh = commit_transaction->t_sync_datalist;
200                 bh = jh2bh(jh);
201                 locked = 0;
202
203                 /* Get reference just to make sure buffer does not disappear
204                  * when we are forced to drop various locks */
205                 get_bh(bh);
206                 /* If the buffer is dirty, we need to submit IO and hence
207                  * we need the buffer lock. We try to lock the buffer without
208                  * blocking. If we fail, we need to drop j_list_lock and do
209                  * blocking lock_buffer().
210                  */
211                 if (buffer_dirty(bh)) {
212                         if (test_set_buffer_locked(bh)) {
213                                 BUFFER_TRACE(bh, "needs blocking lock");
214                                 spin_unlock(&journal->j_list_lock);
215                                 /* Write out all data to prevent deadlocks */
216                                 journal_do_submit_data(wbuf, bufs);
217                                 bufs = 0;
218                                 lock_buffer(bh);
219                                 spin_lock(&journal->j_list_lock);
220                         }
221                         locked = 1;
222                 }
223                 /* We have to get bh_state lock. Again out of order, sigh. */
224                 if (!inverted_lock(journal, bh)) {
225                         jbd_lock_bh_state(bh);
226                         spin_lock(&journal->j_list_lock);
227                 }
228                 /* Someone already cleaned up the buffer? */
229                 if (!buffer_jbd(bh)
230                         || jh->b_transaction != commit_transaction
231                         || jh->b_jlist != BJ_SyncData) {
232                         jbd_unlock_bh_state(bh);
233                         if (locked)
234                                 unlock_buffer(bh);
235                         BUFFER_TRACE(bh, "already cleaned up");
236                         put_bh(bh);
237                         continue;
238                 }
239                 if (locked && test_clear_buffer_dirty(bh)) {
240                         BUFFER_TRACE(bh, "needs writeout, adding to array");
241                         wbuf[bufs++] = bh;
242                         __jbd2_journal_file_buffer(jh, commit_transaction,
243                                                 BJ_Locked);
244                         jbd_unlock_bh_state(bh);
245                         if (bufs == journal->j_wbufsize) {
246                                 spin_unlock(&journal->j_list_lock);
247                                 journal_do_submit_data(wbuf, bufs);
248                                 bufs = 0;
249                                 goto write_out_data;
250                         }
251                 } else if (!locked && buffer_locked(bh)) {
252                         __jbd2_journal_file_buffer(jh, commit_transaction,
253                                                 BJ_Locked);
254                         jbd_unlock_bh_state(bh);
255                         put_bh(bh);
256                 } else {
257                         BUFFER_TRACE(bh, "writeout complete: unfile");
258                         __jbd2_journal_unfile_buffer(jh);
259                         jbd_unlock_bh_state(bh);
260                         if (locked)
261                                 unlock_buffer(bh);
262                         jbd2_journal_remove_journal_head(bh);
263                         /* Once for our safety reference, once for
264                          * jbd2_journal_remove_journal_head() */
265                         put_bh(bh);
266                         put_bh(bh);
267                 }
268
269                 if (lock_need_resched(&journal->j_list_lock)) {
270                         spin_unlock(&journal->j_list_lock);
271                         goto write_out_data;
272                 }
273         }
274         spin_unlock(&journal->j_list_lock);
275         journal_do_submit_data(wbuf, bufs);
276 }
277
278 static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
279                                    unsigned long long block)
280 {
281         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
282         if (tag_bytes > JBD2_TAG_SIZE32)
283                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
284 }
285
286 /*
287  * jbd2_journal_commit_transaction
288  *
289  * The primary function for committing a transaction to the log.  This
290  * function is called by the journal thread to begin a complete commit.
291  */
292 void jbd2_journal_commit_transaction(journal_t *journal)
293 {
294         struct transaction_stats_s stats;
295         transaction_t *commit_transaction;
296         struct journal_head *jh, *new_jh, *descriptor;
297         struct buffer_head **wbuf = journal->j_wbuf;
298         int bufs;
299         int flags;
300         int err;
301         unsigned long long blocknr;
302         char *tagp = NULL;
303         journal_header_t *header;
304         journal_block_tag_t *tag = NULL;
305         int space_left = 0;
306         int first_tag = 0;
307         int tag_flag;
308         int i;
309         int tag_bytes = journal_tag_bytes(journal);
310
311         /*
312          * First job: lock down the current transaction and wait for
313          * all outstanding updates to complete.
314          */
315
316 #ifdef COMMIT_STATS
317         spin_lock(&journal->j_list_lock);
318         summarise_journal_usage(journal);
319         spin_unlock(&journal->j_list_lock);
320 #endif
321
322         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
323         if (journal->j_flags & JBD2_FLUSHED) {
324                 jbd_debug(3, "super block updated\n");
325                 jbd2_journal_update_superblock(journal, 1);
326         } else {
327                 jbd_debug(3, "superblock not updated\n");
328         }
329
330         J_ASSERT(journal->j_running_transaction != NULL);
331         J_ASSERT(journal->j_committing_transaction == NULL);
332
333         commit_transaction = journal->j_running_transaction;
334         J_ASSERT(commit_transaction->t_state == T_RUNNING);
335
336         jbd_debug(1, "JBD: starting commit of transaction %d\n",
337                         commit_transaction->t_tid);
338
339         spin_lock(&journal->j_state_lock);
340         commit_transaction->t_state = T_LOCKED;
341
342         stats.u.run.rs_wait = commit_transaction->t_max_wait;
343         stats.u.run.rs_locked = jiffies;
344         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
345                                                 stats.u.run.rs_locked);
346
347         spin_lock(&commit_transaction->t_handle_lock);
348         while (commit_transaction->t_updates) {
349                 DEFINE_WAIT(wait);
350
351                 prepare_to_wait(&journal->j_wait_updates, &wait,
352                                         TASK_UNINTERRUPTIBLE);
353                 if (commit_transaction->t_updates) {
354                         spin_unlock(&commit_transaction->t_handle_lock);
355                         spin_unlock(&journal->j_state_lock);
356                         schedule();
357                         spin_lock(&journal->j_state_lock);
358                         spin_lock(&commit_transaction->t_handle_lock);
359                 }
360                 finish_wait(&journal->j_wait_updates, &wait);
361         }
362         spin_unlock(&commit_transaction->t_handle_lock);
363
364         J_ASSERT (commit_transaction->t_outstanding_credits <=
365                         journal->j_max_transaction_buffers);
366
367         /*
368          * First thing we are allowed to do is to discard any remaining
369          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
370          * that there are no such buffers: if a large filesystem
371          * operation like a truncate needs to split itself over multiple
372          * transactions, then it may try to do a jbd2_journal_restart() while
373          * there are still BJ_Reserved buffers outstanding.  These must
374          * be released cleanly from the current transaction.
375          *
376          * In this case, the filesystem must still reserve write access
377          * again before modifying the buffer in the new transaction, but
378          * we do not require it to remember exactly which old buffers it
379          * has reserved.  This is consistent with the existing behaviour
380          * that multiple jbd2_journal_get_write_access() calls to the same
381          * buffer are perfectly permissible.
382          */
383         while (commit_transaction->t_reserved_list) {
384                 jh = commit_transaction->t_reserved_list;
385                 JBUFFER_TRACE(jh, "reserved, unused: refile");
386                 /*
387                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
388                  * leave undo-committed data.
389                  */
390                 if (jh->b_committed_data) {
391                         struct buffer_head *bh = jh2bh(jh);
392
393                         jbd_lock_bh_state(bh);
394                         jbd2_free(jh->b_committed_data, bh->b_size);
395                         jh->b_committed_data = NULL;
396                         jbd_unlock_bh_state(bh);
397                 }
398                 jbd2_journal_refile_buffer(journal, jh);
399         }
400
401         /*
402          * Now try to drop any written-back buffers from the journal's
403          * checkpoint lists.  We do this *before* commit because it potentially
404          * frees some memory
405          */
406         spin_lock(&journal->j_list_lock);
407         __jbd2_journal_clean_checkpoint_list(journal);
408         spin_unlock(&journal->j_list_lock);
409
410         jbd_debug (3, "JBD: commit phase 1\n");
411
412         /*
413          * Switch to a new revoke table.
414          */
415         jbd2_journal_switch_revoke_table(journal);
416
417         stats.u.run.rs_flushing = jiffies;
418         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
419                                                stats.u.run.rs_flushing);
420
421         commit_transaction->t_state = T_FLUSH;
422         journal->j_committing_transaction = commit_transaction;
423         journal->j_running_transaction = NULL;
424         commit_transaction->t_log_start = journal->j_head;
425         wake_up(&journal->j_wait_transaction_locked);
426         spin_unlock(&journal->j_state_lock);
427
428         jbd_debug (3, "JBD: commit phase 2\n");
429
430         /*
431          * First, drop modified flag: all accesses to the buffers
432          * will be tracked for a new transaction only -bzzz
433          */
434         spin_lock(&journal->j_list_lock);
435         if (commit_transaction->t_buffers) {
436                 new_jh = jh = commit_transaction->t_buffers->b_tnext;
437                 do {
438                         J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
439                                         new_jh->b_modified == 0);
440                         new_jh->b_modified = 0;
441                         new_jh = new_jh->b_tnext;
442                 } while (new_jh != jh);
443         }
444         spin_unlock(&journal->j_list_lock);
445
446         /*
447          * Now start flushing things to disk, in the order they appear
448          * on the transaction lists.  Data blocks go first.
449          */
450         err = 0;
451         journal_submit_data_buffers(journal, commit_transaction);
452
453         /*
454          * Wait for all previously submitted IO to complete.
455          */
456         spin_lock(&journal->j_list_lock);
457         while (commit_transaction->t_locked_list) {
458                 struct buffer_head *bh;
459
460                 jh = commit_transaction->t_locked_list->b_tprev;
461                 bh = jh2bh(jh);
462                 get_bh(bh);
463                 if (buffer_locked(bh)) {
464                         spin_unlock(&journal->j_list_lock);
465                         wait_on_buffer(bh);
466                         if (unlikely(!buffer_uptodate(bh)))
467                                 err = -EIO;
468                         spin_lock(&journal->j_list_lock);
469                 }
470                 if (!inverted_lock(journal, bh)) {
471                         put_bh(bh);
472                         spin_lock(&journal->j_list_lock);
473                         continue;
474                 }
475                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
476                         __jbd2_journal_unfile_buffer(jh);
477                         jbd_unlock_bh_state(bh);
478                         jbd2_journal_remove_journal_head(bh);
479                         put_bh(bh);
480                 } else {
481                         jbd_unlock_bh_state(bh);
482                 }
483                 put_bh(bh);
484                 cond_resched_lock(&journal->j_list_lock);
485         }
486         spin_unlock(&journal->j_list_lock);
487
488         if (err)
489                 jbd2_journal_abort(journal, err);
490
491         jbd2_journal_write_revoke_records(journal, commit_transaction);
492
493         jbd_debug(3, "JBD: commit phase 2\n");
494
495         /*
496          * If we found any dirty or locked buffers, then we should have
497          * looped back up to the write_out_data label.  If there weren't
498          * any then journal_clean_data_list should have wiped the list
499          * clean by now, so check that it is in fact empty.
500          */
501         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
502
503         jbd_debug (3, "JBD: commit phase 3\n");
504
505         /*
506          * Way to go: we have now written out all of the data for a
507          * transaction!  Now comes the tricky part: we need to write out
508          * metadata.  Loop over the transaction's entire buffer list:
509          */
510         commit_transaction->t_state = T_COMMIT;
511
512         stats.u.run.rs_logging = jiffies;
513         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
514                                                  stats.u.run.rs_logging);
515         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
516         stats.u.run.rs_blocks_logged = 0;
517
518         descriptor = NULL;
519         bufs = 0;
520         while (commit_transaction->t_buffers) {
521
522                 /* Find the next buffer to be journaled... */
523
524                 jh = commit_transaction->t_buffers;
525
526                 /* If we're in abort mode, we just un-journal the buffer and
527                    release it for background writing. */
528
529                 if (is_journal_aborted(journal)) {
530                         JBUFFER_TRACE(jh, "journal is aborting: refile");
531                         jbd2_journal_refile_buffer(journal, jh);
532                         /* If that was the last one, we need to clean up
533                          * any descriptor buffers which may have been
534                          * already allocated, even if we are now
535                          * aborting. */
536                         if (!commit_transaction->t_buffers)
537                                 goto start_journal_io;
538                         continue;
539                 }
540
541                 /* Make sure we have a descriptor block in which to
542                    record the metadata buffer. */
543
544                 if (!descriptor) {
545                         struct buffer_head *bh;
546
547                         J_ASSERT (bufs == 0);
548
549                         jbd_debug(4, "JBD: get descriptor\n");
550
551                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
552                         if (!descriptor) {
553                                 jbd2_journal_abort(journal, -EIO);
554                                 continue;
555                         }
556
557                         bh = jh2bh(descriptor);
558                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
559                                 (unsigned long long)bh->b_blocknr, bh->b_data);
560                         header = (journal_header_t *)&bh->b_data[0];
561                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
562                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
563                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
564
565                         tagp = &bh->b_data[sizeof(journal_header_t)];
566                         space_left = bh->b_size - sizeof(journal_header_t);
567                         first_tag = 1;
568                         set_buffer_jwrite(bh);
569                         set_buffer_dirty(bh);
570                         wbuf[bufs++] = bh;
571
572                         /* Record it so that we can wait for IO
573                            completion later */
574                         BUFFER_TRACE(bh, "ph3: file as descriptor");
575                         jbd2_journal_file_buffer(descriptor, commit_transaction,
576                                         BJ_LogCtl);
577                 }
578
579                 /* Where is the buffer to be written? */
580
581                 err = jbd2_journal_next_log_block(journal, &blocknr);
582                 /* If the block mapping failed, just abandon the buffer
583                    and repeat this loop: we'll fall into the
584                    refile-on-abort condition above. */
585                 if (err) {
586                         jbd2_journal_abort(journal, err);
587                         continue;
588                 }
589
590                 /*
591                  * start_this_handle() uses t_outstanding_credits to determine
592                  * the free space in the log, but this counter is changed
593                  * by jbd2_journal_next_log_block() also.
594                  */
595                 commit_transaction->t_outstanding_credits--;
596
597                 /* Bump b_count to prevent truncate from stumbling over
598                    the shadowed buffer!  @@@ This can go if we ever get
599                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
600                 atomic_inc(&jh2bh(jh)->b_count);
601
602                 /* Make a temporary IO buffer with which to write it out
603                    (this will requeue both the metadata buffer and the
604                    temporary IO buffer). new_bh goes on BJ_IO*/
605
606                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
607                 /*
608                  * akpm: jbd2_journal_write_metadata_buffer() sets
609                  * new_bh->b_transaction to commit_transaction.
610                  * We need to clean this up before we release new_bh
611                  * (which is of type BJ_IO)
612                  */
613                 JBUFFER_TRACE(jh, "ph3: write metadata");
614                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
615                                                       jh, &new_jh, blocknr);
616                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
617                 wbuf[bufs++] = jh2bh(new_jh);
618
619                 /* Record the new block's tag in the current descriptor
620                    buffer */
621
622                 tag_flag = 0;
623                 if (flags & 1)
624                         tag_flag |= JBD2_FLAG_ESCAPE;
625                 if (!first_tag)
626                         tag_flag |= JBD2_FLAG_SAME_UUID;
627
628                 tag = (journal_block_tag_t *) tagp;
629                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
630                 tag->t_flags = cpu_to_be32(tag_flag);
631                 tagp += tag_bytes;
632                 space_left -= tag_bytes;
633
634                 if (first_tag) {
635                         memcpy (tagp, journal->j_uuid, 16);
636                         tagp += 16;
637                         space_left -= 16;
638                         first_tag = 0;
639                 }
640
641                 /* If there's no more to do, or if the descriptor is full,
642                    let the IO rip! */
643
644                 if (bufs == journal->j_wbufsize ||
645                     commit_transaction->t_buffers == NULL ||
646                     space_left < tag_bytes + 16) {
647
648                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
649
650                         /* Write an end-of-descriptor marker before
651                            submitting the IOs.  "tag" still points to
652                            the last tag we set up. */
653
654                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
655
656 start_journal_io:
657                         for (i = 0; i < bufs; i++) {
658                                 struct buffer_head *bh = wbuf[i];
659                                 lock_buffer(bh);
660                                 clear_buffer_dirty(bh);
661                                 set_buffer_uptodate(bh);
662                                 bh->b_end_io = journal_end_buffer_io_sync;
663                                 submit_bh(WRITE, bh);
664                         }
665                         cond_resched();
666                         stats.u.run.rs_blocks_logged += bufs;
667
668                         /* Force a new descriptor to be generated next
669                            time round the loop. */
670                         descriptor = NULL;
671                         bufs = 0;
672                 }
673         }
674
675         /* Lo and behold: we have just managed to send a transaction to
676            the log.  Before we can commit it, wait for the IO so far to
677            complete.  Control buffers being written are on the
678            transaction's t_log_list queue, and metadata buffers are on
679            the t_iobuf_list queue.
680
681            Wait for the buffers in reverse order.  That way we are
682            less likely to be woken up until all IOs have completed, and
683            so we incur less scheduling load.
684         */
685
686         jbd_debug(3, "JBD: commit phase 4\n");
687
688         /*
689          * akpm: these are BJ_IO, and j_list_lock is not needed.
690          * See __journal_try_to_free_buffer.
691          */
692 wait_for_iobuf:
693         while (commit_transaction->t_iobuf_list != NULL) {
694                 struct buffer_head *bh;
695
696                 jh = commit_transaction->t_iobuf_list->b_tprev;
697                 bh = jh2bh(jh);
698                 if (buffer_locked(bh)) {
699                         wait_on_buffer(bh);
700                         goto wait_for_iobuf;
701                 }
702                 if (cond_resched())
703                         goto wait_for_iobuf;
704
705                 if (unlikely(!buffer_uptodate(bh)))
706                         err = -EIO;
707
708                 clear_buffer_jwrite(bh);
709
710                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
711                 jbd2_journal_unfile_buffer(journal, jh);
712
713                 /*
714                  * ->t_iobuf_list should contain only dummy buffer_heads
715                  * which were created by jbd2_journal_write_metadata_buffer().
716                  */
717                 BUFFER_TRACE(bh, "dumping temporary bh");
718                 jbd2_journal_put_journal_head(jh);
719                 __brelse(bh);
720                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
721                 free_buffer_head(bh);
722
723                 /* We also have to unlock and free the corresponding
724                    shadowed buffer */
725                 jh = commit_transaction->t_shadow_list->b_tprev;
726                 bh = jh2bh(jh);
727                 clear_bit(BH_JWrite, &bh->b_state);
728                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
729
730                 /* The metadata is now released for reuse, but we need
731                    to remember it against this transaction so that when
732                    we finally commit, we can do any checkpointing
733                    required. */
734                 JBUFFER_TRACE(jh, "file as BJ_Forget");
735                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
736                 /* Wake up any transactions which were waiting for this
737                    IO to complete */
738                 wake_up_bit(&bh->b_state, BH_Unshadow);
739                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
740                 __brelse(bh);
741         }
742
743         J_ASSERT (commit_transaction->t_shadow_list == NULL);
744
745         jbd_debug(3, "JBD: commit phase 5\n");
746
747         /* Here we wait for the revoke record and descriptor record buffers */
748  wait_for_ctlbuf:
749         while (commit_transaction->t_log_list != NULL) {
750                 struct buffer_head *bh;
751
752                 jh = commit_transaction->t_log_list->b_tprev;
753                 bh = jh2bh(jh);
754                 if (buffer_locked(bh)) {
755                         wait_on_buffer(bh);
756                         goto wait_for_ctlbuf;
757                 }
758                 if (cond_resched())
759                         goto wait_for_ctlbuf;
760
761                 if (unlikely(!buffer_uptodate(bh)))
762                         err = -EIO;
763
764                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
765                 clear_buffer_jwrite(bh);
766                 jbd2_journal_unfile_buffer(journal, jh);
767                 jbd2_journal_put_journal_head(jh);
768                 __brelse(bh);           /* One for getblk */
769                 /* AKPM: bforget here */
770         }
771
772         jbd_debug(3, "JBD: commit phase 6\n");
773
774         if (journal_write_commit_record(journal, commit_transaction))
775                 err = -EIO;
776
777         if (err)
778                 jbd2_journal_abort(journal, err);
779
780         /* End of a transaction!  Finally, we can do checkpoint
781            processing: any buffers committed as a result of this
782            transaction can be removed from any checkpoint list it was on
783            before. */
784
785         jbd_debug(3, "JBD: commit phase 7\n");
786
787         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
788         J_ASSERT(commit_transaction->t_buffers == NULL);
789         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
790         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
791         J_ASSERT(commit_transaction->t_shadow_list == NULL);
792         J_ASSERT(commit_transaction->t_log_list == NULL);
793
794 restart_loop:
795         /*
796          * As there are other places (journal_unmap_buffer()) adding buffers
797          * to this list we have to be careful and hold the j_list_lock.
798          */
799         spin_lock(&journal->j_list_lock);
800         while (commit_transaction->t_forget) {
801                 transaction_t *cp_transaction;
802                 struct buffer_head *bh;
803
804                 jh = commit_transaction->t_forget;
805                 spin_unlock(&journal->j_list_lock);
806                 bh = jh2bh(jh);
807                 jbd_lock_bh_state(bh);
808                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
809                         jh->b_transaction == journal->j_running_transaction);
810
811                 /*
812                  * If there is undo-protected committed data against
813                  * this buffer, then we can remove it now.  If it is a
814                  * buffer needing such protection, the old frozen_data
815                  * field now points to a committed version of the
816                  * buffer, so rotate that field to the new committed
817                  * data.
818                  *
819                  * Otherwise, we can just throw away the frozen data now.
820                  */
821                 if (jh->b_committed_data) {
822                         jbd2_free(jh->b_committed_data, bh->b_size);
823                         jh->b_committed_data = NULL;
824                         if (jh->b_frozen_data) {
825                                 jh->b_committed_data = jh->b_frozen_data;
826                                 jh->b_frozen_data = NULL;
827                         }
828                 } else if (jh->b_frozen_data) {
829                         jbd2_free(jh->b_frozen_data, bh->b_size);
830                         jh->b_frozen_data = NULL;
831                 }
832
833                 spin_lock(&journal->j_list_lock);
834                 cp_transaction = jh->b_cp_transaction;
835                 if (cp_transaction) {
836                         JBUFFER_TRACE(jh, "remove from old cp transaction");
837                         cp_transaction->t_chp_stats.cs_dropped++;
838                         __jbd2_journal_remove_checkpoint(jh);
839                 }
840
841                 /* Only re-checkpoint the buffer_head if it is marked
842                  * dirty.  If the buffer was added to the BJ_Forget list
843                  * by jbd2_journal_forget, it may no longer be dirty and
844                  * there's no point in keeping a checkpoint record for
845                  * it. */
846
847                 /* A buffer which has been freed while still being
848                  * journaled by a previous transaction may end up still
849                  * being dirty here, but we want to avoid writing back
850                  * that buffer in the future now that the last use has
851                  * been committed.  That's not only a performance gain,
852                  * it also stops aliasing problems if the buffer is left
853                  * behind for writeback and gets reallocated for another
854                  * use in a different page. */
855                 if (buffer_freed(bh)) {
856                         clear_buffer_freed(bh);
857                         clear_buffer_jbddirty(bh);
858                 }
859
860                 if (buffer_jbddirty(bh)) {
861                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
862                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
863                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
864                         __jbd2_journal_refile_buffer(jh);
865                         jbd_unlock_bh_state(bh);
866                 } else {
867                         J_ASSERT_BH(bh, !buffer_dirty(bh));
868                         /* The buffer on BJ_Forget list and not jbddirty means
869                          * it has been freed by this transaction and hence it
870                          * could not have been reallocated until this
871                          * transaction has committed. *BUT* it could be
872                          * reallocated once we have written all the data to
873                          * disk and before we process the buffer on BJ_Forget
874                          * list. */
875                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
876                         __jbd2_journal_refile_buffer(jh);
877                         if (!jh->b_transaction) {
878                                 jbd_unlock_bh_state(bh);
879                                  /* needs a brelse */
880                                 jbd2_journal_remove_journal_head(bh);
881                                 release_buffer_page(bh);
882                         } else
883                                 jbd_unlock_bh_state(bh);
884                 }
885                 cond_resched_lock(&journal->j_list_lock);
886         }
887         spin_unlock(&journal->j_list_lock);
888         /*
889          * This is a bit sleazy.  We use j_list_lock to protect transition
890          * of a transaction into T_FINISHED state and calling
891          * __jbd2_journal_drop_transaction(). Otherwise we could race with
892          * other checkpointing code processing the transaction...
893          */
894         spin_lock(&journal->j_state_lock);
895         spin_lock(&journal->j_list_lock);
896         /*
897          * Now recheck if some buffers did not get attached to the transaction
898          * while the lock was dropped...
899          */
900         if (commit_transaction->t_forget) {
901                 spin_unlock(&journal->j_list_lock);
902                 spin_unlock(&journal->j_state_lock);
903                 goto restart_loop;
904         }
905
906         /* Done with this transaction! */
907
908         jbd_debug(3, "JBD: commit phase 8\n");
909
910         J_ASSERT(commit_transaction->t_state == T_COMMIT);
911
912         commit_transaction->t_start = jiffies;
913         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
914                                                 commit_transaction->t_start);
915
916         /*
917          * File the transaction for history
918          */
919         stats.ts_type = JBD2_STATS_RUN;
920         stats.ts_tid = commit_transaction->t_tid;
921         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
922         spin_lock(&journal->j_history_lock);
923         memcpy(journal->j_history + journal->j_history_cur, &stats,
924                         sizeof(stats));
925         if (++journal->j_history_cur == journal->j_history_max)
926                 journal->j_history_cur = 0;
927
928         /*
929          * Calculate overall stats
930          */
931         journal->j_stats.ts_tid++;
932         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
933         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
934         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
935         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
936         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
937         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
938         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
939         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
940         spin_unlock(&journal->j_history_lock);
941
942         commit_transaction->t_state = T_FINISHED;
943         J_ASSERT(commit_transaction == journal->j_committing_transaction);
944         journal->j_commit_sequence = commit_transaction->t_tid;
945         journal->j_committing_transaction = NULL;
946         spin_unlock(&journal->j_state_lock);
947
948         if (commit_transaction->t_checkpoint_list == NULL &&
949             commit_transaction->t_checkpoint_io_list == NULL) {
950                 __jbd2_journal_drop_transaction(journal, commit_transaction);
951         } else {
952                 if (journal->j_checkpoint_transactions == NULL) {
953                         journal->j_checkpoint_transactions = commit_transaction;
954                         commit_transaction->t_cpnext = commit_transaction;
955                         commit_transaction->t_cpprev = commit_transaction;
956                 } else {
957                         commit_transaction->t_cpnext =
958                                 journal->j_checkpoint_transactions;
959                         commit_transaction->t_cpprev =
960                                 commit_transaction->t_cpnext->t_cpprev;
961                         commit_transaction->t_cpnext->t_cpprev =
962                                 commit_transaction;
963                         commit_transaction->t_cpprev->t_cpnext =
964                                 commit_transaction;
965                 }
966         }
967         spin_unlock(&journal->j_list_lock);
968
969         jbd_debug(1, "JBD: commit %d complete, head %d\n",
970                   journal->j_commit_sequence, journal->j_tail_sequence);
971
972         wake_up(&journal->j_wait_done_commit);
973 }