jbd2: Remove data=ordered mode support using jbd buffer heads
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25
26 /*
27  * Default IO end handler for temporary BJ_IO buffer_heads.
28  */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	/*
	 * Propagate the IO result into the buffer's uptodate state, then
	 * drop the buffer lock so waiters (wait_on_buffer) can proceed.
	 */
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
38
39 /*
40  * When an ext4 file is truncated, it is possible that some pages are not
41  * successfully freed, because they are attached to a committing transaction.
42  * After the transaction commits, these pages are left on the LRU, with no
43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
45  * the numbers in /proc/meminfo look odd.
46  *
47  * So here, we have a buffer which has just come off the forget list.  Look to
48  * see if we can strip all buffers from the backing page.
49  *
50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
51  * caller provided us with a ref against the buffer, and we drop that here.
52  */
53 static void release_buffer_page(struct buffer_head *bh)
54 {
55         struct page *page;
56
57         if (buffer_dirty(bh))
58                 goto nope;
59         if (atomic_read(&bh->b_count) != 1)
60                 goto nope;
61         page = bh->b_page;
62         if (!page)
63                 goto nope;
64         if (page->mapping)
65                 goto nope;
66
67         /* OK, it's a truncated page */
68         if (TestSetPageLocked(page))
69                 goto nope;
70
71         page_cache_get(page);
72         __brelse(bh);
73         try_to_free_buffers(page);
74         unlock_page(page);
75         page_cache_release(page);
76         return;
77
78 nope:
79         __brelse(bh);
80 }
81
82 /*
83  * Done it all: now submit the commit record.  We should have
84  * cleaned up our previous buffers by now, so if we are in abort
85  * mode we can now just skip the rest of the journal write
86  * entirely.
87  *
88  * Returns 1 if the journal needs to be aborted or 0 on success
89  */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	/* Nothing to write once the journal has been aborted. */
	if (is_journal_aborted(journal))
		return 0;

	/* Allocate a journal descriptor block to hold the commit record. */
	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the on-disk commit header (all fields big-endian). */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	/*
	 * If the journal carries per-transaction checksums, record the
	 * crc32 accumulated over the transaction's blocks so recovery can
	 * validate the whole transaction.
	 */
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	/*
	 * Lock and prepare the buffer for IO; journal_end_buffer_io_sync
	 * will unlock it on completion.  Extra get_bh() reference is
	 * dropped later in journal_wait_on_commit_record().
	 */
	lock_buffer(bh);
	get_bh(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/*
	 * Use a barrier write for the commit block unless async commit is
	 * enabled (async commit relies on checksums rather than ordering).
	 */
	if (journal->j_flags & JBD2_BARRIER &&
		!JBD2_HAS_INCOMPAT_FEATURE(journal,
					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: barrier-based sync failed on %s - "
			"disabling barriers\n",
			bdevname(journal->j_dev, b));
		/* Disable barriers journal-wide so we don't retry them. */
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = submit_bh(WRITE, bh);
	}
	/* Hand the in-flight buffer back so the caller can wait on it. */
	*cbh = bh;
	return ret;
}
167
168 /*
169  * This function along with journal_submit_commit_record
170  * allows to write the commit record asynchronously.
171  */
172 static int journal_wait_on_commit_record(struct buffer_head *bh)
173 {
174         int ret = 0;
175
176         clear_buffer_dirty(bh);
177         wait_on_buffer(bh);
178
179         if (unlikely(!buffer_uptodate(bh)))
180                 ret = -EIO;
181         put_bh(bh);            /* One for getblk() */
182         jbd2_journal_put_journal_head(bh2jh(bh));
183
184         return ret;
185 }
186
187 /*
188  * Submit all the data buffers of inode associated with the transaction to
189  * disk.
190  *
191  * We are in a committing transaction. Therefore no new inode can be added to
192  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
193  * operate on from being released while we write out pages.
194  */
static int journal_submit_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;	/* ret holds the first error seen */
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/*
		 * JI_COMMIT_RUNNING keeps this inode pinned while we drop
		 * j_list_lock to do the (possibly blocking) writeback.
		 */
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* Write back the inode's data pages up to its current size. */
		err = filemap_fdatawrite_range(mapping, 0,
					i_size_read(jinode->i_vfs_inode));
		/* Remember only the first failure; keep submitting the rest. */
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		/* Wake anyone waiting for the commit to release this inode. */
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
219
220 /*
221  * Wait for data submitted for writeout, refile inodes to proper
222  * transaction if needed.
223  *
224  */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;	/* ret holds the first error seen */

	/* For locking, see the comment in journal_submit_inode_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Pin the inode while we drop the lock to wait for IO. */
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* Wait for the writeback submitted in the previous phase. */
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/*
			 * The inode was dirtied again while we were
			 * committing: move it onto the next transaction's
			 * inode list.
			 */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* No further transaction claims this inode. */
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
261
262 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
263 {
264         struct page *page = bh->b_page;
265         char *addr;
266         __u32 checksum;
267
268         addr = kmap_atomic(page, KM_USER0);
269         checksum = crc32_be(crc32_sum,
270                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
271         kunmap_atomic(addr, KM_USER0);
272
273         return checksum;
274 }
275
276 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
277                                    unsigned long long block)
278 {
279         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
280         if (tag_bytes > JBD2_TAG_SIZE32)
281                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
282 }
283
284 /*
285  * jbd2_journal_commit_transaction
286  *
287  * The primary function for committing a transaction to the log.  This
288  * function is called by the journal thread to begin a complete commit.
289  */
290 void jbd2_journal_commit_transaction(journal_t *journal)
291 {
292         struct transaction_stats_s stats;
293         transaction_t *commit_transaction;
294         struct journal_head *jh, *new_jh, *descriptor;
295         struct buffer_head **wbuf = journal->j_wbuf;
296         int bufs;
297         int flags;
298         int err;
299         unsigned long long blocknr;
300         char *tagp = NULL;
301         journal_header_t *header;
302         journal_block_tag_t *tag = NULL;
303         int space_left = 0;
304         int first_tag = 0;
305         int tag_flag;
306         int i;
307         int tag_bytes = journal_tag_bytes(journal);
308         struct buffer_head *cbh = NULL; /* For transactional checksums */
309         __u32 crc32_sum = ~0;
310
311         /*
312          * First job: lock down the current transaction and wait for
313          * all outstanding updates to complete.
314          */
315
316 #ifdef COMMIT_STATS
317         spin_lock(&journal->j_list_lock);
318         summarise_journal_usage(journal);
319         spin_unlock(&journal->j_list_lock);
320 #endif
321
322         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
323         if (journal->j_flags & JBD2_FLUSHED) {
324                 jbd_debug(3, "super block updated\n");
325                 jbd2_journal_update_superblock(journal, 1);
326         } else {
327                 jbd_debug(3, "superblock not updated\n");
328         }
329
330         J_ASSERT(journal->j_running_transaction != NULL);
331         J_ASSERT(journal->j_committing_transaction == NULL);
332
333         commit_transaction = journal->j_running_transaction;
334         J_ASSERT(commit_transaction->t_state == T_RUNNING);
335
336         jbd_debug(1, "JBD: starting commit of transaction %d\n",
337                         commit_transaction->t_tid);
338
339         spin_lock(&journal->j_state_lock);
340         commit_transaction->t_state = T_LOCKED;
341
342         stats.u.run.rs_wait = commit_transaction->t_max_wait;
343         stats.u.run.rs_locked = jiffies;
344         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
345                                                 stats.u.run.rs_locked);
346
347         spin_lock(&commit_transaction->t_handle_lock);
348         while (commit_transaction->t_updates) {
349                 DEFINE_WAIT(wait);
350
351                 prepare_to_wait(&journal->j_wait_updates, &wait,
352                                         TASK_UNINTERRUPTIBLE);
353                 if (commit_transaction->t_updates) {
354                         spin_unlock(&commit_transaction->t_handle_lock);
355                         spin_unlock(&journal->j_state_lock);
356                         schedule();
357                         spin_lock(&journal->j_state_lock);
358                         spin_lock(&commit_transaction->t_handle_lock);
359                 }
360                 finish_wait(&journal->j_wait_updates, &wait);
361         }
362         spin_unlock(&commit_transaction->t_handle_lock);
363
364         J_ASSERT (commit_transaction->t_outstanding_credits <=
365                         journal->j_max_transaction_buffers);
366
367         /*
368          * First thing we are allowed to do is to discard any remaining
369          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
370          * that there are no such buffers: if a large filesystem
371          * operation like a truncate needs to split itself over multiple
372          * transactions, then it may try to do a jbd2_journal_restart() while
373          * there are still BJ_Reserved buffers outstanding.  These must
374          * be released cleanly from the current transaction.
375          *
376          * In this case, the filesystem must still reserve write access
377          * again before modifying the buffer in the new transaction, but
378          * we do not require it to remember exactly which old buffers it
379          * has reserved.  This is consistent with the existing behaviour
380          * that multiple jbd2_journal_get_write_access() calls to the same
381          * buffer are perfectly permissable.
382          */
383         while (commit_transaction->t_reserved_list) {
384                 jh = commit_transaction->t_reserved_list;
385                 JBUFFER_TRACE(jh, "reserved, unused: refile");
386                 /*
387                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
388                  * leave undo-committed data.
389                  */
390                 if (jh->b_committed_data) {
391                         struct buffer_head *bh = jh2bh(jh);
392
393                         jbd_lock_bh_state(bh);
394                         jbd2_free(jh->b_committed_data, bh->b_size);
395                         jh->b_committed_data = NULL;
396                         jbd_unlock_bh_state(bh);
397                 }
398                 jbd2_journal_refile_buffer(journal, jh);
399         }
400
401         /*
402          * Now try to drop any written-back buffers from the journal's
403          * checkpoint lists.  We do this *before* commit because it potentially
404          * frees some memory
405          */
406         spin_lock(&journal->j_list_lock);
407         __jbd2_journal_clean_checkpoint_list(journal);
408         spin_unlock(&journal->j_list_lock);
409
410         jbd_debug (3, "JBD: commit phase 1\n");
411
412         /*
413          * Switch to a new revoke table.
414          */
415         jbd2_journal_switch_revoke_table(journal);
416
417         stats.u.run.rs_flushing = jiffies;
418         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
419                                                stats.u.run.rs_flushing);
420
421         commit_transaction->t_state = T_FLUSH;
422         journal->j_committing_transaction = commit_transaction;
423         journal->j_running_transaction = NULL;
424         commit_transaction->t_log_start = journal->j_head;
425         wake_up(&journal->j_wait_transaction_locked);
426         spin_unlock(&journal->j_state_lock);
427
428         jbd_debug (3, "JBD: commit phase 2\n");
429
430         /*
431          * Now start flushing things to disk, in the order they appear
432          * on the transaction lists.  Data blocks go first.
433          */
434         err = journal_submit_inode_data_buffers(journal, commit_transaction);
435         if (err)
436                 jbd2_journal_abort(journal, err);
437
438         jbd2_journal_write_revoke_records(journal, commit_transaction);
439
440         jbd_debug(3, "JBD: commit phase 2\n");
441
442         /*
443          * Way to go: we have now written out all of the data for a
444          * transaction!  Now comes the tricky part: we need to write out
445          * metadata.  Loop over the transaction's entire buffer list:
446          */
447         spin_lock(&journal->j_state_lock);
448         commit_transaction->t_state = T_COMMIT;
449         spin_unlock(&journal->j_state_lock);
450
451         stats.u.run.rs_logging = jiffies;
452         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
453                                                  stats.u.run.rs_logging);
454         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
455         stats.u.run.rs_blocks_logged = 0;
456
457         J_ASSERT(commit_transaction->t_nr_buffers <=
458                  commit_transaction->t_outstanding_credits);
459
460         err = 0;
461         descriptor = NULL;
462         bufs = 0;
463         while (commit_transaction->t_buffers) {
464
465                 /* Find the next buffer to be journaled... */
466
467                 jh = commit_transaction->t_buffers;
468
469                 /* If we're in abort mode, we just un-journal the buffer and
470                    release it for background writing. */
471
472                 if (is_journal_aborted(journal)) {
473                         JBUFFER_TRACE(jh, "journal is aborting: refile");
474                         jbd2_journal_refile_buffer(journal, jh);
475                         /* If that was the last one, we need to clean up
476                          * any descriptor buffers which may have been
477                          * already allocated, even if we are now
478                          * aborting. */
479                         if (!commit_transaction->t_buffers)
480                                 goto start_journal_io;
481                         continue;
482                 }
483
484                 /* Make sure we have a descriptor block in which to
485                    record the metadata buffer. */
486
487                 if (!descriptor) {
488                         struct buffer_head *bh;
489
490                         J_ASSERT (bufs == 0);
491
492                         jbd_debug(4, "JBD: get descriptor\n");
493
494                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
495                         if (!descriptor) {
496                                 jbd2_journal_abort(journal, -EIO);
497                                 continue;
498                         }
499
500                         bh = jh2bh(descriptor);
501                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
502                                 (unsigned long long)bh->b_blocknr, bh->b_data);
503                         header = (journal_header_t *)&bh->b_data[0];
504                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
505                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
506                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
507
508                         tagp = &bh->b_data[sizeof(journal_header_t)];
509                         space_left = bh->b_size - sizeof(journal_header_t);
510                         first_tag = 1;
511                         set_buffer_jwrite(bh);
512                         set_buffer_dirty(bh);
513                         wbuf[bufs++] = bh;
514
515                         /* Record it so that we can wait for IO
516                            completion later */
517                         BUFFER_TRACE(bh, "ph3: file as descriptor");
518                         jbd2_journal_file_buffer(descriptor, commit_transaction,
519                                         BJ_LogCtl);
520                 }
521
522                 /* Where is the buffer to be written? */
523
524                 err = jbd2_journal_next_log_block(journal, &blocknr);
525                 /* If the block mapping failed, just abandon the buffer
526                    and repeat this loop: we'll fall into the
527                    refile-on-abort condition above. */
528                 if (err) {
529                         jbd2_journal_abort(journal, err);
530                         continue;
531                 }
532
533                 /*
534                  * start_this_handle() uses t_outstanding_credits to determine
535                  * the free space in the log, but this counter is changed
536                  * by jbd2_journal_next_log_block() also.
537                  */
538                 commit_transaction->t_outstanding_credits--;
539
540                 /* Bump b_count to prevent truncate from stumbling over
541                    the shadowed buffer!  @@@ This can go if we ever get
542                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
543                 atomic_inc(&jh2bh(jh)->b_count);
544
545                 /* Make a temporary IO buffer with which to write it out
546                    (this will requeue both the metadata buffer and the
547                    temporary IO buffer). new_bh goes on BJ_IO*/
548
549                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
550                 /*
551                  * akpm: jbd2_journal_write_metadata_buffer() sets
552                  * new_bh->b_transaction to commit_transaction.
553                  * We need to clean this up before we release new_bh
554                  * (which is of type BJ_IO)
555                  */
556                 JBUFFER_TRACE(jh, "ph3: write metadata");
557                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
558                                                       jh, &new_jh, blocknr);
559                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
560                 wbuf[bufs++] = jh2bh(new_jh);
561
562                 /* Record the new block's tag in the current descriptor
563                    buffer */
564
565                 tag_flag = 0;
566                 if (flags & 1)
567                         tag_flag |= JBD2_FLAG_ESCAPE;
568                 if (!first_tag)
569                         tag_flag |= JBD2_FLAG_SAME_UUID;
570
571                 tag = (journal_block_tag_t *) tagp;
572                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
573                 tag->t_flags = cpu_to_be32(tag_flag);
574                 tagp += tag_bytes;
575                 space_left -= tag_bytes;
576
577                 if (first_tag) {
578                         memcpy (tagp, journal->j_uuid, 16);
579                         tagp += 16;
580                         space_left -= 16;
581                         first_tag = 0;
582                 }
583
584                 /* If there's no more to do, or if the descriptor is full,
585                    let the IO rip! */
586
587                 if (bufs == journal->j_wbufsize ||
588                     commit_transaction->t_buffers == NULL ||
589                     space_left < tag_bytes + 16) {
590
591                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
592
593                         /* Write an end-of-descriptor marker before
594                            submitting the IOs.  "tag" still points to
595                            the last tag we set up. */
596
597                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
598
599 start_journal_io:
600                         for (i = 0; i < bufs; i++) {
601                                 struct buffer_head *bh = wbuf[i];
602                                 /*
603                                  * Compute checksum.
604                                  */
605                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
606                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
607                                         crc32_sum =
608                                             jbd2_checksum_data(crc32_sum, bh);
609                                 }
610
611                                 lock_buffer(bh);
612                                 clear_buffer_dirty(bh);
613                                 set_buffer_uptodate(bh);
614                                 bh->b_end_io = journal_end_buffer_io_sync;
615                                 submit_bh(WRITE, bh);
616                         }
617                         cond_resched();
618                         stats.u.run.rs_blocks_logged += bufs;
619
620                         /* Force a new descriptor to be generated next
621                            time round the loop. */
622                         descriptor = NULL;
623                         bufs = 0;
624                 }
625         }
626
627         /* Done it all: now write the commit record asynchronously. */
628
629         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
630                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
631                 err = journal_submit_commit_record(journal, commit_transaction,
632                                                  &cbh, crc32_sum);
633                 if (err)
634                         __jbd2_journal_abort_hard(journal);
635         }
636
637         /*
638          * This is the right place to wait for data buffers both for ASYNC
639          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
640          * the commit block went to disk (which happens above). If commit is
641          * SYNC, we need to wait for data buffers before we start writing
642          * commit block, which happens below in such setting.
643          */
644         err = journal_finish_inode_data_buffers(journal, commit_transaction);
645         if (err)
646                 jbd2_journal_abort(journal, err);
647
648         /* Lo and behold: we have just managed to send a transaction to
649            the log.  Before we can commit it, wait for the IO so far to
650            complete.  Control buffers being written are on the
651            transaction's t_log_list queue, and metadata buffers are on
652            the t_iobuf_list queue.
653
654            Wait for the buffers in reverse order.  That way we are
655            less likely to be woken up until all IOs have completed, and
656            so we incur less scheduling load.
657         */
658
659         jbd_debug(3, "JBD: commit phase 3\n");
660
661         /*
662          * akpm: these are BJ_IO, and j_list_lock is not needed.
663          * See __journal_try_to_free_buffer.
664          */
665 wait_for_iobuf:
666         while (commit_transaction->t_iobuf_list != NULL) {
667                 struct buffer_head *bh;
668
669                 jh = commit_transaction->t_iobuf_list->b_tprev;
670                 bh = jh2bh(jh);
671                 if (buffer_locked(bh)) {
672                         wait_on_buffer(bh);
673                         goto wait_for_iobuf;
674                 }
675                 if (cond_resched())
676                         goto wait_for_iobuf;
677
678                 if (unlikely(!buffer_uptodate(bh)))
679                         err = -EIO;
680
681                 clear_buffer_jwrite(bh);
682
683                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
684                 jbd2_journal_unfile_buffer(journal, jh);
685
686                 /*
687                  * ->t_iobuf_list should contain only dummy buffer_heads
688                  * which were created by jbd2_journal_write_metadata_buffer().
689                  */
690                 BUFFER_TRACE(bh, "dumping temporary bh");
691                 jbd2_journal_put_journal_head(jh);
692                 __brelse(bh);
693                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
694                 free_buffer_head(bh);
695
696                 /* We also have to unlock and free the corresponding
697                    shadowed buffer */
698                 jh = commit_transaction->t_shadow_list->b_tprev;
699                 bh = jh2bh(jh);
700                 clear_bit(BH_JWrite, &bh->b_state);
701                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
702
703                 /* The metadata is now released for reuse, but we need
704                    to remember it against this transaction so that when
705                    we finally commit, we can do any checkpointing
706                    required. */
707                 JBUFFER_TRACE(jh, "file as BJ_Forget");
708                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
709                 /* Wake up any transactions which were waiting for this
710                    IO to complete */
711                 wake_up_bit(&bh->b_state, BH_Unshadow);
712                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
713                 __brelse(bh);
714         }
715
716         J_ASSERT (commit_transaction->t_shadow_list == NULL);
717
718         jbd_debug(3, "JBD: commit phase 4\n");
719
720         /* Here we wait for the revoke record and descriptor record buffers */
721  wait_for_ctlbuf:
722         while (commit_transaction->t_log_list != NULL) {
723                 struct buffer_head *bh;
724
725                 jh = commit_transaction->t_log_list->b_tprev;
726                 bh = jh2bh(jh);
727                 if (buffer_locked(bh)) {
728                         wait_on_buffer(bh);
729                         goto wait_for_ctlbuf;
730                 }
731                 if (cond_resched())
732                         goto wait_for_ctlbuf;
733
734                 if (unlikely(!buffer_uptodate(bh)))
735                         err = -EIO;
736
737                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
738                 clear_buffer_jwrite(bh);
739                 jbd2_journal_unfile_buffer(journal, jh);
740                 jbd2_journal_put_journal_head(jh);
741                 __brelse(bh);           /* One for getblk */
742                 /* AKPM: bforget here */
743         }
744
745         jbd_debug(3, "JBD: commit phase 5\n");
746
747         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
748                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
749                 err = journal_submit_commit_record(journal, commit_transaction,
750                                                 &cbh, crc32_sum);
751                 if (err)
752                         __jbd2_journal_abort_hard(journal);
753         }
754         if (!err && !is_journal_aborted(journal))
755                 err = journal_wait_on_commit_record(cbh);
756
757         if (err)
758                 jbd2_journal_abort(journal, err);
759
760         /* End of a transaction!  Finally, we can do checkpoint
761            processing: any buffers committed as a result of this
762            transaction can be removed from any checkpoint list it was on
763            before. */
764
765         jbd_debug(3, "JBD: commit phase 6\n");
766
767         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
768         J_ASSERT(commit_transaction->t_buffers == NULL);
769         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
770         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
771         J_ASSERT(commit_transaction->t_shadow_list == NULL);
772         J_ASSERT(commit_transaction->t_log_list == NULL);
773
774 restart_loop:
775         /*
776          * As there are other places (journal_unmap_buffer()) adding buffers
777          * to this list we have to be careful and hold the j_list_lock.
778          */
779         spin_lock(&journal->j_list_lock);
780         while (commit_transaction->t_forget) {
781                 transaction_t *cp_transaction;
782                 struct buffer_head *bh;
783
784                 jh = commit_transaction->t_forget;
785                 spin_unlock(&journal->j_list_lock);
786                 bh = jh2bh(jh);
787                 jbd_lock_bh_state(bh);
788                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
789                         jh->b_transaction == journal->j_running_transaction);
790
791                 /*
792                  * If there is undo-protected committed data against
793                  * this buffer, then we can remove it now.  If it is a
794                  * buffer needing such protection, the old frozen_data
795                  * field now points to a committed version of the
796                  * buffer, so rotate that field to the new committed
797                  * data.
798                  *
799                  * Otherwise, we can just throw away the frozen data now.
800                  */
801                 if (jh->b_committed_data) {
802                         jbd2_free(jh->b_committed_data, bh->b_size);
803                         jh->b_committed_data = NULL;
804                         if (jh->b_frozen_data) {
805                                 jh->b_committed_data = jh->b_frozen_data;
806                                 jh->b_frozen_data = NULL;
807                         }
808                 } else if (jh->b_frozen_data) {
809                         jbd2_free(jh->b_frozen_data, bh->b_size);
810                         jh->b_frozen_data = NULL;
811                 }
812
813                 spin_lock(&journal->j_list_lock);
814                 cp_transaction = jh->b_cp_transaction;
815                 if (cp_transaction) {
816                         JBUFFER_TRACE(jh, "remove from old cp transaction");
817                         cp_transaction->t_chp_stats.cs_dropped++;
818                         __jbd2_journal_remove_checkpoint(jh);
819                 }
820
821                 /* Only re-checkpoint the buffer_head if it is marked
822                  * dirty.  If the buffer was added to the BJ_Forget list
823                  * by jbd2_journal_forget, it may no longer be dirty and
824                  * there's no point in keeping a checkpoint record for
825                  * it. */
826
827                 /* A buffer which has been freed while still being
828                  * journaled by a previous transaction may end up still
829                  * being dirty here, but we want to avoid writing back
830                  * that buffer in the future now that the last use has
831                  * been committed.  That's not only a performance gain,
832                  * it also stops aliasing problems if the buffer is left
833                  * behind for writeback and gets reallocated for another
834                  * use in a different page. */
835                 if (buffer_freed(bh)) {
836                         clear_buffer_freed(bh);
837                         clear_buffer_jbddirty(bh);
838                 }
839
840                 if (buffer_jbddirty(bh)) {
841                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
842                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
843                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
844                         __jbd2_journal_refile_buffer(jh);
845                         jbd_unlock_bh_state(bh);
846                 } else {
847                         J_ASSERT_BH(bh, !buffer_dirty(bh));
848                         /* The buffer on BJ_Forget list and not jbddirty means
849                          * it has been freed by this transaction and hence it
850                          * could not have been reallocated until this
851                          * transaction has committed. *BUT* it could be
852                          * reallocated once we have written all the data to
853                          * disk and before we process the buffer on BJ_Forget
854                          * list. */
855                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
856                         __jbd2_journal_refile_buffer(jh);
857                         if (!jh->b_transaction) {
858                                 jbd_unlock_bh_state(bh);
859                                  /* needs a brelse */
860                                 jbd2_journal_remove_journal_head(bh);
861                                 release_buffer_page(bh);
862                         } else
863                                 jbd_unlock_bh_state(bh);
864                 }
865                 cond_resched_lock(&journal->j_list_lock);
866         }
867         spin_unlock(&journal->j_list_lock);
868         /*
869          * This is a bit sleazy.  We use j_list_lock to protect transition
870          * of a transaction into T_FINISHED state and calling
871          * __jbd2_journal_drop_transaction(). Otherwise we could race with
872          * other checkpointing code processing the transaction...
873          */
874         spin_lock(&journal->j_state_lock);
875         spin_lock(&journal->j_list_lock);
876         /*
877          * Now recheck if some buffers did not get attached to the transaction
878          * while the lock was dropped...
879          */
880         if (commit_transaction->t_forget) {
881                 spin_unlock(&journal->j_list_lock);
882                 spin_unlock(&journal->j_state_lock);
883                 goto restart_loop;
884         }
885
886         /* Done with this transaction! */
887
888         jbd_debug(3, "JBD: commit phase 7\n");
889
890         J_ASSERT(commit_transaction->t_state == T_COMMIT);
891
892         commit_transaction->t_start = jiffies;
893         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
894                                                 commit_transaction->t_start);
895
896         /*
897          * File the transaction for history
898          */
899         stats.ts_type = JBD2_STATS_RUN;
900         stats.ts_tid = commit_transaction->t_tid;
901         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
902         spin_lock(&journal->j_history_lock);
903         memcpy(journal->j_history + journal->j_history_cur, &stats,
904                         sizeof(stats));
905         if (++journal->j_history_cur == journal->j_history_max)
906                 journal->j_history_cur = 0;
907
908         /*
909          * Calculate overall stats
910          */
911         journal->j_stats.ts_tid++;
912         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
913         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
914         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
915         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
916         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
917         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
918         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
919         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
920         spin_unlock(&journal->j_history_lock);
921
922         commit_transaction->t_state = T_FINISHED;
923         J_ASSERT(commit_transaction == journal->j_committing_transaction);
924         journal->j_commit_sequence = commit_transaction->t_tid;
925         journal->j_committing_transaction = NULL;
926         spin_unlock(&journal->j_state_lock);
927
928         if (commit_transaction->t_checkpoint_list == NULL &&
929             commit_transaction->t_checkpoint_io_list == NULL) {
930                 __jbd2_journal_drop_transaction(journal, commit_transaction);
931         } else {
932                 if (journal->j_checkpoint_transactions == NULL) {
933                         journal->j_checkpoint_transactions = commit_transaction;
934                         commit_transaction->t_cpnext = commit_transaction;
935                         commit_transaction->t_cpprev = commit_transaction;
936                 } else {
937                         commit_transaction->t_cpnext =
938                                 journal->j_checkpoint_transactions;
939                         commit_transaction->t_cpprev =
940                                 commit_transaction->t_cpnext->t_cpprev;
941                         commit_transaction->t_cpnext->t_cpprev =
942                                 commit_transaction;
943                         commit_transaction->t_cpprev->t_cpnext =
944                                 commit_transaction;
945                 }
946         }
947         spin_unlock(&journal->j_list_lock);
948
949         jbd_debug(1, "JBD: commit %d complete, head %d\n",
950                   journal->j_commit_sequence, journal->j_tail_sequence);
951
952         wake_up(&journal->j_wait_done_commit);
953 }