jbd2: fix race between jbd2_journal_remove_checkpoint and ->j_commit_callback
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 *
 * This is the ->b_end_io callback installed on buffers submitted from this
 * file.  It copies the I/O outcome into the buffer's uptodate bit and then
 * unlocks the buffer.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded, zero if it failed
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        /* Unlock last, after the uptodate bit reflects the I/O outcome. */
        unlock_buffer(bh);
}
44
45 /*
46  * When an ext4 file is truncated, it is possible that some pages are not
47  * successfully freed, because they are attached to a committing transaction.
48  * After the transaction commits, these pages are left on the LRU, with no
49  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
50  * by the VM, but their apparent absence upsets the VM accounting, and it makes
51  * the numbers in /proc/meminfo look odd.
52  *
53  * So here, we have a buffer which has just come off the forget list.  Look to
54  * see if we can strip all buffers from the backing page.
55  *
56  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
57  * caller provided us with a ref against the buffer, and we drop that here.
58  */
59 static void release_buffer_page(struct buffer_head *bh)
60 {
61         struct page *page;
62
63         if (buffer_dirty(bh))
64                 goto nope;
65         if (atomic_read(&bh->b_count) != 1)
66                 goto nope;
67         page = bh->b_page;
68         if (!page)
69                 goto nope;
70         if (page->mapping)
71                 goto nope;
72
73         /* OK, it's a truncated page */
74         if (!trylock_page(page))
75                 goto nope;
76
77         page_cache_get(page);
78         __brelse(bh);
79         try_to_free_buffers(page);
80         unlock_page(page);
81         page_cache_release(page);
82         return;
83
84 nope:
85         __brelse(bh);
86 }
87
88 /*
89  * Done it all: now submit the commit record.  We should have
90  * cleaned up our previous buffers by now, so if we are in abort
91  * mode we can now just skip the rest of the journal write
92  * entirely.
93  *
94  * Returns 1 if the journal needs to be aborted or 0 on success
95  */
96 static int journal_submit_commit_record(journal_t *journal,
97                                         transaction_t *commit_transaction,
98                                         struct buffer_head **cbh,
99                                         __u32 crc32_sum)
100 {
101         struct journal_head *descriptor;
102         struct commit_header *tmp;
103         struct buffer_head *bh;
104         int ret;
105         struct timespec now = current_kernel_time();
106
107         *cbh = NULL;
108
109         if (is_journal_aborted(journal))
110                 return 0;
111
112         descriptor = jbd2_journal_get_descriptor_buffer(journal);
113         if (!descriptor)
114                 return 1;
115
116         bh = jh2bh(descriptor);
117
118         tmp = (struct commit_header *)bh->b_data;
119         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
120         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
121         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
122         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
123         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
124
125         if (JBD2_HAS_COMPAT_FEATURE(journal,
126                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
127                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
128                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
129                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
130         }
131
132         JBUFFER_TRACE(descriptor, "submit commit block");
133         lock_buffer(bh);
134         clear_buffer_dirty(bh);
135         set_buffer_uptodate(bh);
136         bh->b_end_io = journal_end_buffer_io_sync;
137
138         if (journal->j_flags & JBD2_BARRIER &&
139             !JBD2_HAS_INCOMPAT_FEATURE(journal,
140                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
141                 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
142         else
143                 ret = submit_bh(WRITE_SYNC, bh);
144
145         *cbh = bh;
146         return ret;
147 }
148
149 /*
150  * This function along with journal_submit_commit_record
151  * allows to write the commit record asynchronously.
152  */
153 static int journal_wait_on_commit_record(journal_t *journal,
154                                          struct buffer_head *bh)
155 {
156         int ret = 0;
157
158         clear_buffer_dirty(bh);
159         wait_on_buffer(bh);
160
161         if (unlikely(!buffer_uptodate(bh)))
162                 ret = -EIO;
163         put_bh(bh);            /* One for getblk() */
164         jbd2_journal_put_journal_head(bh2jh(bh));
165
166         return ret;
167 }
168
169 /*
170  * write the filemap data using writepage() address_space_operations.
171  * We don't do block allocation here even for delalloc. We don't
172  * use writepages() because with dealyed allocation we may be doing
173  * block allocation in writepages().
174  */
175 static int journal_submit_inode_data_buffers(struct address_space *mapping)
176 {
177         int ret;
178         struct writeback_control wbc = {
179                 .sync_mode =  WB_SYNC_ALL,
180                 .nr_to_write = mapping->nrpages * 2,
181                 .range_start = 0,
182                 .range_end = i_size_read(mapping->host),
183         };
184
185         ret = generic_writepages(mapping, &wbc);
186         return ret;
187 }
188
189 /*
190  * Submit all the data buffers of inode associated with the transaction to
191  * disk.
192  *
193  * We are in a committing transaction. Therefore no new inode can be added to
194  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
195  * operate on from being released while we write out pages.
196  */
197 static int journal_submit_data_buffers(journal_t *journal,
198                 transaction_t *commit_transaction)
199 {
200         struct jbd2_inode *jinode;
201         int err, ret = 0;
202         struct address_space *mapping;
203
204         spin_lock(&journal->j_list_lock);
205         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
206                 mapping = jinode->i_vfs_inode->i_mapping;
207                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
208                 spin_unlock(&journal->j_list_lock);
209                 /*
210                  * submit the inode data buffers. We use writepage
211                  * instead of writepages. Because writepages can do
212                  * block allocation  with delalloc. We need to write
213                  * only allocated blocks here.
214                  */
215                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
216                 err = journal_submit_inode_data_buffers(mapping);
217                 if (!ret)
218                         ret = err;
219                 spin_lock(&journal->j_list_lock);
220                 J_ASSERT(jinode->i_transaction == commit_transaction);
221                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222                 smp_mb__after_clear_bit();
223                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
224         }
225         spin_unlock(&journal->j_list_lock);
226         return ret;
227 }
228
229 /*
230  * Wait for data submitted for writeout, refile inodes to proper
231  * transaction if needed.
232  *
233  */
234 static int journal_finish_inode_data_buffers(journal_t *journal,
235                 transaction_t *commit_transaction)
236 {
237         struct jbd2_inode *jinode, *next_i;
238         int err, ret = 0;
239
240         /* For locking, see the comment in journal_submit_data_buffers() */
241         spin_lock(&journal->j_list_lock);
242         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
243                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
244                 spin_unlock(&journal->j_list_lock);
245                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
246                 if (err) {
247                         /*
248                          * Because AS_EIO is cleared by
249                          * filemap_fdatawait_range(), set it again so
250                          * that user process can get -EIO from fsync().
251                          */
252                         set_bit(AS_EIO,
253                                 &jinode->i_vfs_inode->i_mapping->flags);
254
255                         if (!ret)
256                                 ret = err;
257                 }
258                 spin_lock(&journal->j_list_lock);
259                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260                 smp_mb__after_clear_bit();
261                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
262         }
263
264         /* Now refile inode to proper lists */
265         list_for_each_entry_safe(jinode, next_i,
266                                  &commit_transaction->t_inode_list, i_list) {
267                 list_del(&jinode->i_list);
268                 if (jinode->i_next_transaction) {
269                         jinode->i_transaction = jinode->i_next_transaction;
270                         jinode->i_next_transaction = NULL;
271                         list_add(&jinode->i_list,
272                                 &jinode->i_transaction->t_inode_list);
273                 } else {
274                         jinode->i_transaction = NULL;
275                 }
276         }
277         spin_unlock(&journal->j_list_lock);
278
279         return ret;
280 }
281
282 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
283 {
284         struct page *page = bh->b_page;
285         char *addr;
286         __u32 checksum;
287
288         addr = kmap_atomic(page);
289         checksum = crc32_be(crc32_sum,
290                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
291         kunmap_atomic(addr);
292
293         return checksum;
294 }
295
296 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
297                                    unsigned long long block)
298 {
299         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
300         if (tag_bytes > JBD2_TAG_SIZE32)
301                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
302 }
303
304 /*
305  * jbd2_journal_commit_transaction
306  *
307  * The primary function for committing a transaction to the log.  This
308  * function is called by the journal thread to begin a complete commit.
309  */
310 void jbd2_journal_commit_transaction(journal_t *journal)
311 {
312         struct transaction_stats_s stats;
313         transaction_t *commit_transaction;
314         struct journal_head *jh, *new_jh, *descriptor;
315         struct buffer_head **wbuf = journal->j_wbuf;
316         int bufs;
317         int flags;
318         int err;
319         unsigned long long blocknr;
320         ktime_t start_time;
321         u64 commit_time;
322         char *tagp = NULL;
323         journal_header_t *header;
324         journal_block_tag_t *tag = NULL;
325         int space_left = 0;
326         int first_tag = 0;
327         int tag_flag;
328         int i;
329         int tag_bytes = journal_tag_bytes(journal);
330         struct buffer_head *cbh = NULL; /* For transactional checksums */
331         __u32 crc32_sum = ~0;
332         struct blk_plug plug;
333         /* Tail of the journal */
334         unsigned long first_block;
335         tid_t first_tid;
336         int update_tail;
337
338         /*
339          * First job: lock down the current transaction and wait for
340          * all outstanding updates to complete.
341          */
342
343         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
344         if (journal->j_flags & JBD2_FLUSHED) {
345                 jbd_debug(3, "super block updated\n");
346                 mutex_lock(&journal->j_checkpoint_mutex);
347                 /*
348                  * We hold j_checkpoint_mutex so tail cannot change under us.
349                  * We don't need any special data guarantees for writing sb
350                  * since journal is empty and it is ok for write to be
351                  * flushed only with transaction commit.
352                  */
353                 jbd2_journal_update_sb_log_tail(journal,
354                                                 journal->j_tail_sequence,
355                                                 journal->j_tail,
356                                                 WRITE_SYNC);
357                 mutex_unlock(&journal->j_checkpoint_mutex);
358         } else {
359                 jbd_debug(3, "superblock not updated\n");
360         }
361
362         J_ASSERT(journal->j_running_transaction != NULL);
363         J_ASSERT(journal->j_committing_transaction == NULL);
364
365         commit_transaction = journal->j_running_transaction;
366         J_ASSERT(commit_transaction->t_state == T_RUNNING);
367
368         trace_jbd2_start_commit(journal, commit_transaction);
369         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
370                         commit_transaction->t_tid);
371
372         write_lock(&journal->j_state_lock);
373         commit_transaction->t_state = T_LOCKED;
374
375         trace_jbd2_commit_locking(journal, commit_transaction);
376         stats.run.rs_wait = commit_transaction->t_max_wait;
377         stats.run.rs_locked = jiffies;
378         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
379                                               stats.run.rs_locked);
380
381         spin_lock(&commit_transaction->t_handle_lock);
382         while (atomic_read(&commit_transaction->t_updates)) {
383                 DEFINE_WAIT(wait);
384
385                 prepare_to_wait(&journal->j_wait_updates, &wait,
386                                         TASK_UNINTERRUPTIBLE);
387                 if (atomic_read(&commit_transaction->t_updates)) {
388                         spin_unlock(&commit_transaction->t_handle_lock);
389                         write_unlock(&journal->j_state_lock);
390                         schedule();
391                         write_lock(&journal->j_state_lock);
392                         spin_lock(&commit_transaction->t_handle_lock);
393                 }
394                 finish_wait(&journal->j_wait_updates, &wait);
395         }
396         spin_unlock(&commit_transaction->t_handle_lock);
397
398         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
399                         journal->j_max_transaction_buffers);
400
401         /*
402          * First thing we are allowed to do is to discard any remaining
403          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
404          * that there are no such buffers: if a large filesystem
405          * operation like a truncate needs to split itself over multiple
406          * transactions, then it may try to do a jbd2_journal_restart() while
407          * there are still BJ_Reserved buffers outstanding.  These must
408          * be released cleanly from the current transaction.
409          *
410          * In this case, the filesystem must still reserve write access
411          * again before modifying the buffer in the new transaction, but
412          * we do not require it to remember exactly which old buffers it
413          * has reserved.  This is consistent with the existing behaviour
414          * that multiple jbd2_journal_get_write_access() calls to the same
415          * buffer are perfectly permissible.
416          */
417         while (commit_transaction->t_reserved_list) {
418                 jh = commit_transaction->t_reserved_list;
419                 JBUFFER_TRACE(jh, "reserved, unused: refile");
420                 /*
421                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
422                  * leave undo-committed data.
423                  */
424                 if (jh->b_committed_data) {
425                         struct buffer_head *bh = jh2bh(jh);
426
427                         jbd_lock_bh_state(bh);
428                         jbd2_free(jh->b_committed_data, bh->b_size);
429                         jh->b_committed_data = NULL;
430                         jbd_unlock_bh_state(bh);
431                 }
432                 jbd2_journal_refile_buffer(journal, jh);
433         }
434
435         /*
436          * Now try to drop any written-back buffers from the journal's
437          * checkpoint lists.  We do this *before* commit because it potentially
438          * frees some memory
439          */
440         spin_lock(&journal->j_list_lock);
441         __jbd2_journal_clean_checkpoint_list(journal);
442         spin_unlock(&journal->j_list_lock);
443
444         jbd_debug(3, "JBD2: commit phase 1\n");
445
446         /*
447          * Clear revoked flag to reflect there is no revoked buffers
448          * in the next transaction which is going to be started.
449          */
450         jbd2_clear_buffer_revoked_flags(journal);
451
452         /*
453          * Switch to a new revoke table.
454          */
455         jbd2_journal_switch_revoke_table(journal);
456
457         trace_jbd2_commit_flushing(journal, commit_transaction);
458         stats.run.rs_flushing = jiffies;
459         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
460                                              stats.run.rs_flushing);
461
462         commit_transaction->t_state = T_FLUSH;
463         journal->j_committing_transaction = commit_transaction;
464         journal->j_running_transaction = NULL;
465         start_time = ktime_get();
466         commit_transaction->t_log_start = journal->j_head;
467         wake_up(&journal->j_wait_transaction_locked);
468         write_unlock(&journal->j_state_lock);
469
470         jbd_debug(3, "JBD2: commit phase 2\n");
471
472         /*
473          * Now start flushing things to disk, in the order they appear
474          * on the transaction lists.  Data blocks go first.
475          */
476         err = journal_submit_data_buffers(journal, commit_transaction);
477         if (err)
478                 jbd2_journal_abort(journal, err);
479
480         blk_start_plug(&plug);
481         jbd2_journal_write_revoke_records(journal, commit_transaction,
482                                           WRITE_SYNC);
483         blk_finish_plug(&plug);
484
485         jbd_debug(3, "JBD2: commit phase 2\n");
486
487         /*
488          * Way to go: we have now written out all of the data for a
489          * transaction!  Now comes the tricky part: we need to write out
490          * metadata.  Loop over the transaction's entire buffer list:
491          */
492         write_lock(&journal->j_state_lock);
493         commit_transaction->t_state = T_COMMIT;
494         write_unlock(&journal->j_state_lock);
495
496         trace_jbd2_commit_logging(journal, commit_transaction);
497         stats.run.rs_logging = jiffies;
498         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
499                                                stats.run.rs_logging);
500         stats.run.rs_blocks =
501                 atomic_read(&commit_transaction->t_outstanding_credits);
502         stats.run.rs_blocks_logged = 0;
503
504         J_ASSERT(commit_transaction->t_nr_buffers <=
505                  atomic_read(&commit_transaction->t_outstanding_credits));
506
507         err = 0;
508         descriptor = NULL;
509         bufs = 0;
510         blk_start_plug(&plug);
511         while (commit_transaction->t_buffers) {
512
513                 /* Find the next buffer to be journaled... */
514
515                 jh = commit_transaction->t_buffers;
516
517                 /* If we're in abort mode, we just un-journal the buffer and
518                    release it. */
519
520                 if (is_journal_aborted(journal)) {
521                         clear_buffer_jbddirty(jh2bh(jh));
522                         JBUFFER_TRACE(jh, "journal is aborting: refile");
523                         jbd2_buffer_abort_trigger(jh,
524                                                   jh->b_frozen_data ?
525                                                   jh->b_frozen_triggers :
526                                                   jh->b_triggers);
527                         jbd2_journal_refile_buffer(journal, jh);
528                         /* If that was the last one, we need to clean up
529                          * any descriptor buffers which may have been
530                          * already allocated, even if we are now
531                          * aborting. */
532                         if (!commit_transaction->t_buffers)
533                                 goto start_journal_io;
534                         continue;
535                 }
536
537                 /* Make sure we have a descriptor block in which to
538                    record the metadata buffer. */
539
540                 if (!descriptor) {
541                         struct buffer_head *bh;
542
543                         J_ASSERT (bufs == 0);
544
545                         jbd_debug(4, "JBD2: get descriptor\n");
546
547                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
548                         if (!descriptor) {
549                                 jbd2_journal_abort(journal, -EIO);
550                                 continue;
551                         }
552
553                         bh = jh2bh(descriptor);
554                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
555                                 (unsigned long long)bh->b_blocknr, bh->b_data);
556                         header = (journal_header_t *)&bh->b_data[0];
557                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
558                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
559                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
560
561                         tagp = &bh->b_data[sizeof(journal_header_t)];
562                         space_left = bh->b_size - sizeof(journal_header_t);
563                         first_tag = 1;
564                         set_buffer_jwrite(bh);
565                         set_buffer_dirty(bh);
566                         wbuf[bufs++] = bh;
567
568                         /* Record it so that we can wait for IO
569                            completion later */
570                         BUFFER_TRACE(bh, "ph3: file as descriptor");
571                         jbd2_journal_file_buffer(descriptor, commit_transaction,
572                                         BJ_LogCtl);
573                 }
574
575                 /* Where is the buffer to be written? */
576
577                 err = jbd2_journal_next_log_block(journal, &blocknr);
578                 /* If the block mapping failed, just abandon the buffer
579                    and repeat this loop: we'll fall into the
580                    refile-on-abort condition above. */
581                 if (err) {
582                         jbd2_journal_abort(journal, err);
583                         continue;
584                 }
585
586                 /*
587                  * start_this_handle() uses t_outstanding_credits to determine
588                  * the free space in the log, but this counter is changed
589                  * by jbd2_journal_next_log_block() also.
590                  */
591                 atomic_dec(&commit_transaction->t_outstanding_credits);
592
593                 /* Bump b_count to prevent truncate from stumbling over
594                    the shadowed buffer!  @@@ This can go if we ever get
595                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
596                 atomic_inc(&jh2bh(jh)->b_count);
597
598                 /* Make a temporary IO buffer with which to write it out
599                    (this will requeue both the metadata buffer and the
600                    temporary IO buffer). new_bh goes on BJ_IO*/
601
602                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
603                 /*
604                  * akpm: jbd2_journal_write_metadata_buffer() sets
605                  * new_bh->b_transaction to commit_transaction.
606                  * We need to clean this up before we release new_bh
607                  * (which is of type BJ_IO)
608                  */
609                 JBUFFER_TRACE(jh, "ph3: write metadata");
610                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
611                                                       jh, &new_jh, blocknr);
612                 if (flags < 0) {
613                         jbd2_journal_abort(journal, flags);
614                         continue;
615                 }
616                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
617                 wbuf[bufs++] = jh2bh(new_jh);
618
619                 /* Record the new block's tag in the current descriptor
620                    buffer */
621
622                 tag_flag = 0;
623                 if (flags & 1)
624                         tag_flag |= JBD2_FLAG_ESCAPE;
625                 if (!first_tag)
626                         tag_flag |= JBD2_FLAG_SAME_UUID;
627
628                 tag = (journal_block_tag_t *) tagp;
629                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
630                 tag->t_flags = cpu_to_be32(tag_flag);
631                 tagp += tag_bytes;
632                 space_left -= tag_bytes;
633
634                 if (first_tag) {
635                         memcpy (tagp, journal->j_uuid, 16);
636                         tagp += 16;
637                         space_left -= 16;
638                         first_tag = 0;
639                 }
640
641                 /* If there's no more to do, or if the descriptor is full,
642                    let the IO rip! */
643
644                 if (bufs == journal->j_wbufsize ||
645                     commit_transaction->t_buffers == NULL ||
646                     space_left < tag_bytes + 16) {
647
648                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
649
650                         /* Write an end-of-descriptor marker before
651                            submitting the IOs.  "tag" still points to
652                            the last tag we set up. */
653
654                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
655
656 start_journal_io:
657                         for (i = 0; i < bufs; i++) {
658                                 struct buffer_head *bh = wbuf[i];
659                                 /*
660                                  * Compute checksum.
661                                  */
662                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
663                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
664                                         crc32_sum =
665                                             jbd2_checksum_data(crc32_sum, bh);
666                                 }
667
668                                 lock_buffer(bh);
669                                 clear_buffer_dirty(bh);
670                                 set_buffer_uptodate(bh);
671                                 bh->b_end_io = journal_end_buffer_io_sync;
672                                 submit_bh(WRITE_SYNC, bh);
673                         }
674                         cond_resched();
675                         stats.run.rs_blocks_logged += bufs;
676
677                         /* Force a new descriptor to be generated next
678                            time round the loop. */
679                         descriptor = NULL;
680                         bufs = 0;
681                 }
682         }
683
684         err = journal_finish_inode_data_buffers(journal, commit_transaction);
685         if (err) {
686                 printk(KERN_WARNING
687                         "JBD2: Detected IO errors while flushing file data "
688                        "on %s\n", journal->j_devname);
689                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
690                         jbd2_journal_abort(journal, err);
691                 err = 0;
692         }
693
694         /*
695          * Get current oldest transaction in the log before we issue flush
696          * to the filesystem device. After the flush we can be sure that
697          * blocks of all older transactions are checkpointed to persistent
698          * storage and we will be safe to update journal start in the
699          * superblock with the numbers we get here.
700          */
701         update_tail =
702                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
703
704         write_lock(&journal->j_state_lock);
705         if (update_tail) {
706                 long freed = first_block - journal->j_tail;
707
708                 if (first_block < journal->j_tail)
709                         freed += journal->j_last - journal->j_first;
710                 /* Update tail only if we free significant amount of space */
711                 if (freed < journal->j_maxlen / 4)
712                         update_tail = 0;
713         }
714         J_ASSERT(commit_transaction->t_state == T_COMMIT);
715         commit_transaction->t_state = T_COMMIT_DFLUSH;
716         write_unlock(&journal->j_state_lock);
717
718         /* 
719          * If the journal is not located on the file system device,
720          * then we must flush the file system device before we issue
721          * the commit record
722          */
723         if (commit_transaction->t_need_data_flush &&
724             (journal->j_fs_dev != journal->j_dev) &&
725             (journal->j_flags & JBD2_BARRIER))
726                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
727
728         /* Done it all: now write the commit record asynchronously. */
729         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
730                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
731                 err = journal_submit_commit_record(journal, commit_transaction,
732                                                  &cbh, crc32_sum);
733                 if (err)
734                         __jbd2_journal_abort_hard(journal);
735         }
736
737         blk_finish_plug(&plug);
738
739         /* Lo and behold: we have just managed to send a transaction to
740            the log.  Before we can commit it, wait for the IO so far to
741            complete.  Control buffers being written are on the
742            transaction's t_log_list queue, and metadata buffers are on
743            the t_iobuf_list queue.
744
745            Wait for the buffers in reverse order.  That way we are
746            less likely to be woken up until all IOs have completed, and
747            so we incur less scheduling load.
748         */
749
750         jbd_debug(3, "JBD2: commit phase 3\n");
751
752         /*
753          * akpm: these are BJ_IO, and j_list_lock is not needed.
754          * See __journal_try_to_free_buffer.
755          */
756 wait_for_iobuf:
757         while (commit_transaction->t_iobuf_list != NULL) {
758                 struct buffer_head *bh;
759
760                 jh = commit_transaction->t_iobuf_list->b_tprev;
761                 bh = jh2bh(jh);
762                 if (buffer_locked(bh)) {
763                         wait_on_buffer(bh);
764                         goto wait_for_iobuf;
765                 }
766                 if (cond_resched())
767                         goto wait_for_iobuf;
768
769                 if (unlikely(!buffer_uptodate(bh)))
770                         err = -EIO;
771
772                 clear_buffer_jwrite(bh);
773
774                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
775                 jbd2_journal_unfile_buffer(journal, jh);
776
777                 /*
778                  * ->t_iobuf_list should contain only dummy buffer_heads
779                  * which were created by jbd2_journal_write_metadata_buffer().
780                  */
781                 BUFFER_TRACE(bh, "dumping temporary bh");
782                 jbd2_journal_put_journal_head(jh);
783                 __brelse(bh);
784                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
785                 free_buffer_head(bh);
786
787                 /* We also have to unlock and free the corresponding
788                    shadowed buffer */
789                 jh = commit_transaction->t_shadow_list->b_tprev;
790                 bh = jh2bh(jh);
791                 clear_bit(BH_JWrite, &bh->b_state);
792                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
793
794                 /* The metadata is now released for reuse, but we need
795                    to remember it against this transaction so that when
796                    we finally commit, we can do any checkpointing
797                    required. */
798                 JBUFFER_TRACE(jh, "file as BJ_Forget");
799                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
800                 /*
801                  * Wake up any transactions which were waiting for this IO to
802                  * complete. The barrier must be here so that changes by
803                  * jbd2_journal_file_buffer() take effect before wake_up_bit()
804                  * does the waitqueue check.
805                  */
806                 smp_mb();
807                 wake_up_bit(&bh->b_state, BH_Unshadow);
808                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
809                 __brelse(bh);
810         }
811
812         J_ASSERT (commit_transaction->t_shadow_list == NULL);
813
814         jbd_debug(3, "JBD2: commit phase 4\n");
815
816         /* Here we wait for the revoke record and descriptor record buffers */
817  wait_for_ctlbuf:
818         while (commit_transaction->t_log_list != NULL) {
819                 struct buffer_head *bh;
820
821                 jh = commit_transaction->t_log_list->b_tprev;
822                 bh = jh2bh(jh);
823                 if (buffer_locked(bh)) {
824                         wait_on_buffer(bh);
825                         goto wait_for_ctlbuf;
826                 }
827                 if (cond_resched())
828                         goto wait_for_ctlbuf;
829
830                 if (unlikely(!buffer_uptodate(bh)))
831                         err = -EIO;
832
833                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
834                 clear_buffer_jwrite(bh);
835                 jbd2_journal_unfile_buffer(journal, jh);
836                 jbd2_journal_put_journal_head(jh);
837                 __brelse(bh);           /* One for getblk */
838                 /* AKPM: bforget here */
839         }
840
841         if (err)
842                 jbd2_journal_abort(journal, err);
843
844         jbd_debug(3, "JBD2: commit phase 5\n");
845         write_lock(&journal->j_state_lock);
846         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
847         commit_transaction->t_state = T_COMMIT_JFLUSH;
848         write_unlock(&journal->j_state_lock);
849
850         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
851                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
852                 err = journal_submit_commit_record(journal, commit_transaction,
853                                                 &cbh, crc32_sum);
854                 if (err)
855                         __jbd2_journal_abort_hard(journal);
856         }
857         if (cbh)
858                 err = journal_wait_on_commit_record(journal, cbh);
859         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
860                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
861             journal->j_flags & JBD2_BARRIER) {
862                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
863         }
864
865         if (err)
866                 jbd2_journal_abort(journal, err);
867
868         /*
869          * Now disk caches for filesystem device are flushed so we are safe to
870          * erase checkpointed transactions from the log by updating journal
871          * superblock.
872          */
873         if (update_tail)
874                 jbd2_update_log_tail(journal, first_tid, first_block);
875
876         /* End of a transaction!  Finally, we can do checkpoint
877            processing: any buffers committed as a result of this
878            transaction can be removed from any checkpoint list it was on
879            before. */
880
881         jbd_debug(3, "JBD2: commit phase 6\n");
882
883         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
884         J_ASSERT(commit_transaction->t_buffers == NULL);
885         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
886         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
887         J_ASSERT(commit_transaction->t_shadow_list == NULL);
888         J_ASSERT(commit_transaction->t_log_list == NULL);
889
890 restart_loop:
891         /*
892          * As there are other places (journal_unmap_buffer()) adding buffers
893          * to this list we have to be careful and hold the j_list_lock.
894          */
895         spin_lock(&journal->j_list_lock);
896         while (commit_transaction->t_forget) {
897                 transaction_t *cp_transaction;
898                 struct buffer_head *bh;
899                 int try_to_free = 0;
900
901                 jh = commit_transaction->t_forget;
902                 spin_unlock(&journal->j_list_lock);
903                 bh = jh2bh(jh);
904                 /*
905                  * Get a reference so that bh cannot be freed before we are
906                  * done with it.
907                  */
908                 get_bh(bh);
909                 jbd_lock_bh_state(bh);
910                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
911
912                 /*
913                  * If there is undo-protected committed data against
914                  * this buffer, then we can remove it now.  If it is a
915                  * buffer needing such protection, the old frozen_data
916                  * field now points to a committed version of the
917                  * buffer, so rotate that field to the new committed
918                  * data.
919                  *
920                  * Otherwise, we can just throw away the frozen data now.
921                  *
922                  * We also know that the frozen data has already fired
923                  * its triggers if they exist, so we can clear that too.
924                  */
925                 if (jh->b_committed_data) {
926                         jbd2_free(jh->b_committed_data, bh->b_size);
927                         jh->b_committed_data = NULL;
928                         if (jh->b_frozen_data) {
929                                 jh->b_committed_data = jh->b_frozen_data;
930                                 jh->b_frozen_data = NULL;
931                                 jh->b_frozen_triggers = NULL;
932                         }
933                 } else if (jh->b_frozen_data) {
934                         jbd2_free(jh->b_frozen_data, bh->b_size);
935                         jh->b_frozen_data = NULL;
936                         jh->b_frozen_triggers = NULL;
937                 }
938
939                 spin_lock(&journal->j_list_lock);
940                 cp_transaction = jh->b_cp_transaction;
941                 if (cp_transaction) {
942                         JBUFFER_TRACE(jh, "remove from old cp transaction");
943                         cp_transaction->t_chp_stats.cs_dropped++;
944                         __jbd2_journal_remove_checkpoint(jh);
945                 }
946
947                 /* Only re-checkpoint the buffer_head if it is marked
948                  * dirty.  If the buffer was added to the BJ_Forget list
949                  * by jbd2_journal_forget, it may no longer be dirty and
950                  * there's no point in keeping a checkpoint record for
951                  * it. */
952
953                 /* A buffer which has been freed while still being
954                  * journaled by a previous transaction may end up still
955                  * being dirty here, but we want to avoid writing back
956                  * that buffer in the future after the "add to orphan"
957                  * operation has been committed.  That's not only a performance
958                  * gain, it also stops aliasing problems if the buffer is
959                  * left behind for writeback and gets reallocated for another
960                  * use in a different page. */
961                 if (buffer_freed(bh) && !jh->b_next_transaction) {
962                         clear_buffer_freed(bh);
963                         clear_buffer_jbddirty(bh);
964                 }
965
966                 if (buffer_jbddirty(bh)) {
967                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
968                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
969                         if (is_journal_aborted(journal))
970                                 clear_buffer_jbddirty(bh);
971                 } else {
972                         J_ASSERT_BH(bh, !buffer_dirty(bh));
973                         /*
974                          * The buffer on BJ_Forget list and not jbddirty means
975                          * it has been freed by this transaction and hence it
976                          * could not have been reallocated until this
977                          * transaction has committed. *BUT* it could be
978                          * reallocated once we have written all the data to
979                          * disk and before we process the buffer on BJ_Forget
980                          * list.
981                          */
982                         if (!jh->b_next_transaction)
983                                 try_to_free = 1;
984                 }
985                 JBUFFER_TRACE(jh, "refile or unfile buffer");
986                 __jbd2_journal_refile_buffer(jh);
987                 jbd_unlock_bh_state(bh);
988                 if (try_to_free)
989                         release_buffer_page(bh);        /* Drops bh reference */
990                 else
991                         __brelse(bh);
992                 cond_resched_lock(&journal->j_list_lock);
993         }
994         spin_unlock(&journal->j_list_lock);
995         /*
996          * This is a bit sleazy.  We use j_list_lock to protect transition
997          * of a transaction into T_FINISHED state and calling
998          * __jbd2_journal_drop_transaction(). Otherwise we could race with
999          * other checkpointing code processing the transaction...
1000          */
1001         write_lock(&journal->j_state_lock);
1002         spin_lock(&journal->j_list_lock);
1003         /*
1004          * Now recheck if some buffers did not get attached to the transaction
1005          * while the lock was dropped...
1006          */
1007         if (commit_transaction->t_forget) {
1008                 spin_unlock(&journal->j_list_lock);
1009                 write_unlock(&journal->j_state_lock);
1010                 goto restart_loop;
1011         }
1012
1013         /* Done with this transaction! */
1014
1015         jbd_debug(3, "JBD2: commit phase 7\n");
1016
1017         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1018
1019         commit_transaction->t_start = jiffies;
1020         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1021                                               commit_transaction->t_start);
1022
1023         /*
1024          * File the transaction statistics
1025          */
1026         stats.ts_tid = commit_transaction->t_tid;
1027         stats.run.rs_handle_count =
1028                 atomic_read(&commit_transaction->t_handle_count);
1029         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1030                              commit_transaction->t_tid, &stats.run);
1031
1032         /*
1033          * Calculate overall stats
1034          */
1035         spin_lock(&journal->j_history_lock);
1036         journal->j_stats.ts_tid++;
1037         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1038         journal->j_stats.run.rs_running += stats.run.rs_running;
1039         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1040         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1041         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1042         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1043         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1044         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1045         spin_unlock(&journal->j_history_lock);
1046
1047         commit_transaction->t_state = T_COMMIT_CALLBACK;
1048         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1049         journal->j_commit_sequence = commit_transaction->t_tid;
1050         journal->j_committing_transaction = NULL;
1051         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1052
1053         /*
1054          * weight the commit time higher than the average time so we don't
1055          * react too strongly to vast changes in the commit time
1056          */
1057         if (likely(journal->j_average_commit_time))
1058                 journal->j_average_commit_time = (commit_time +
1059                                 journal->j_average_commit_time*3) / 4;
1060         else
1061                 journal->j_average_commit_time = commit_time;
1062
1063         write_unlock(&journal->j_state_lock);
1064
1065         if (journal->j_checkpoint_transactions == NULL) {
1066                 journal->j_checkpoint_transactions = commit_transaction;
1067                 commit_transaction->t_cpnext = commit_transaction;
1068                 commit_transaction->t_cpprev = commit_transaction;
1069         } else {
1070                 commit_transaction->t_cpnext =
1071                         journal->j_checkpoint_transactions;
1072                 commit_transaction->t_cpprev =
1073                         commit_transaction->t_cpnext->t_cpprev;
1074                 commit_transaction->t_cpnext->t_cpprev =
1075                         commit_transaction;
1076                 commit_transaction->t_cpprev->t_cpnext =
1077                                 commit_transaction;
1078         }
1079         spin_unlock(&journal->j_list_lock);
1080         /* Drop all spin_locks because commit_callback may block.
1081          * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1082          * under us because it is not marked as T_FINISHED yet */
1083         if (journal->j_commit_callback)
1084                 journal->j_commit_callback(journal, commit_transaction);
1085
1086         trace_jbd2_end_commit(journal, commit_transaction);
1087         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1088                   journal->j_commit_sequence, journal->j_tail_sequence);
1089
1090         write_lock(&journal->j_state_lock);
1091         spin_lock(&journal->j_list_lock);
1092         commit_transaction->t_state = T_FINISHED;
1093         /* Recheck checkpoint lists after j_list_lock was dropped */
1094         if (commit_transaction->t_checkpoint_list == NULL &&
1095             commit_transaction->t_checkpoint_io_list == NULL) {
1096                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1097                 jbd2_journal_free_transaction(commit_transaction);
1098         }
1099         spin_unlock(&journal->j_list_lock);
1100         write_unlock(&journal->j_state_lock);
1101         wake_up(&journal->j_wait_done_commit);
1102 }