/*
 * jbd2: replace barriers with explicit flush / FUA usage
 * (linux-2.6.git, fs/jbd2/commit.c)
 */
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <trace/events/jbd2.h>
30
/*
 * Default IO completion handler for temporary BJ_IO buffer_heads:
 * record whether the write succeeded, then release the buffer lock so
 * waiters in the commit path can proceed.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
43
44 /*
45  * When an ext4 file is truncated, it is possible that some pages are not
46  * successfully freed, because they are attached to a committing transaction.
47  * After the transaction commits, these pages are left on the LRU, with no
48  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
49  * by the VM, but their apparent absence upsets the VM accounting, and it makes
50  * the numbers in /proc/meminfo look odd.
51  *
52  * So here, we have a buffer which has just come off the forget list.  Look to
53  * see if we can strip all buffers from the backing page.
54  *
55  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
56  * caller provided us with a ref against the buffer, and we drop that here.
57  */
58 static void release_buffer_page(struct buffer_head *bh)
59 {
60         struct page *page;
61
62         if (buffer_dirty(bh))
63                 goto nope;
64         if (atomic_read(&bh->b_count) != 1)
65                 goto nope;
66         page = bh->b_page;
67         if (!page)
68                 goto nope;
69         if (page->mapping)
70                 goto nope;
71
72         /* OK, it's a truncated page */
73         if (!trylock_page(page))
74                 goto nope;
75
76         page_cache_get(page);
77         __brelse(bh);
78         try_to_free_buffers(page);
79         unlock_page(page);
80         page_cache_release(page);
81         return;
82
83 nope:
84         __brelse(bh);
85 }
86
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	/* An aborted journal writes nothing more; this is not an error. */
	if (is_journal_aborted(journal))
		return 0;

	/* No descriptor block available -> caller must abort the journal. */
	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Fill in the on-disk commit header: magic, block type, the
	 * committing transaction's tid, and the current wall-clock time. */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	/* Record the accumulated transaction checksum when the journal
	 * carries the CHECKSUM compat feature. */
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/* With barriers enabled and no async-commit feature, the commit
	 * block must be preceded by a cache flush and written FUA so it
	 * reaches stable storage only after the preceding journal blocks.
	 * Otherwise a plain synchronous write suffices. */
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC_PLUG, bh);

	/* Hand the still-in-flight buffer back; the caller waits on it
	 * via journal_wait_on_commit_record(). */
	*cbh = bh;
	return ret;
}
145
146 /*
147  * This function along with journal_submit_commit_record
148  * allows to write the commit record asynchronously.
149  */
150 static int journal_wait_on_commit_record(journal_t *journal,
151                                          struct buffer_head *bh)
152 {
153         int ret = 0;
154
155         clear_buffer_dirty(bh);
156         wait_on_buffer(bh);
157
158         if (unlikely(!buffer_uptodate(bh)))
159                 ret = -EIO;
160         put_bh(bh);            /* One for getblk() */
161         jbd2_journal_put_journal_head(bh2jh(bh));
162
163         return ret;
164 }
165
166 /*
167  * write the filemap data using writepage() address_space_operations.
168  * We don't do block allocation here even for delalloc. We don't
169  * use writepages() because with dealyed allocation we may be doing
170  * block allocation in writepages().
171  */
172 static int journal_submit_inode_data_buffers(struct address_space *mapping)
173 {
174         int ret;
175         struct writeback_control wbc = {
176                 .sync_mode =  WB_SYNC_ALL,
177                 .nr_to_write = mapping->nrpages * 2,
178                 .range_start = 0,
179                 .range_end = i_size_read(mapping->host),
180         };
181
182         ret = generic_writepages(mapping, &wbc);
183         return ret;
184 }
185
/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 *
 * Returns 0 on success, or the first writeback error encountered.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/* Pin this inode entry, then drop the list lock for the
		 * (potentially long) writeout below. */
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * submit the inode data buffers. We use writepage
		 * instead of writepages. Because writepages can do
		 * block allocation  with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		/* Remember only the first error; keep submitting the rest. */
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		/* Record that file data was written, so the commit code
		 * knows a flush of the fs device may be needed later. */
		commit_transaction->t_flushed_data_blocks = 1;
		/* Unpin and wake anyone waiting on __JI_COMMIT_RUNNING. */
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
225
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 * Returns 0 on success, or the first -EIO encountered while waiting.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Pin the entry, drop the lock while we sleep in wait. */
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			/* Keep only the first error; keep waiting on rest. */
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Inode is also part of the next transaction:
			 * move it onto that transaction's inode list. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* Done with this inode for journaling purposes. */
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
277
278 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
279 {
280         struct page *page = bh->b_page;
281         char *addr;
282         __u32 checksum;
283
284         addr = kmap_atomic(page, KM_USER0);
285         checksum = crc32_be(crc32_sum,
286                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
287         kunmap_atomic(addr, KM_USER0);
288
289         return checksum;
290 }
291
292 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
293                                    unsigned long long block)
294 {
295         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
296         if (tag_bytes > JBD2_TAG_SIZE32)
297                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
298 }
299
300 /*
301  * jbd2_journal_commit_transaction
302  *
303  * The primary function for committing a transaction to the log.  This
304  * function is called by the journal thread to begin a complete commit.
305  */
306 void jbd2_journal_commit_transaction(journal_t *journal)
307 {
308         struct transaction_stats_s stats;
309         transaction_t *commit_transaction;
310         struct journal_head *jh, *new_jh, *descriptor;
311         struct buffer_head **wbuf = journal->j_wbuf;
312         int bufs;
313         int flags;
314         int err;
315         unsigned long long blocknr;
316         ktime_t start_time;
317         u64 commit_time;
318         char *tagp = NULL;
319         journal_header_t *header;
320         journal_block_tag_t *tag = NULL;
321         int space_left = 0;
322         int first_tag = 0;
323         int tag_flag;
324         int i, to_free = 0;
325         int tag_bytes = journal_tag_bytes(journal);
326         struct buffer_head *cbh = NULL; /* For transactional checksums */
327         __u32 crc32_sum = ~0;
328         int write_op = WRITE;
329
330         /*
331          * First job: lock down the current transaction and wait for
332          * all outstanding updates to complete.
333          */
334
335 #ifdef COMMIT_STATS
336         spin_lock(&journal->j_list_lock);
337         summarise_journal_usage(journal);
338         spin_unlock(&journal->j_list_lock);
339 #endif
340
341         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
342         if (journal->j_flags & JBD2_FLUSHED) {
343                 jbd_debug(3, "super block updated\n");
344                 jbd2_journal_update_superblock(journal, 1);
345         } else {
346                 jbd_debug(3, "superblock not updated\n");
347         }
348
349         J_ASSERT(journal->j_running_transaction != NULL);
350         J_ASSERT(journal->j_committing_transaction == NULL);
351
352         commit_transaction = journal->j_running_transaction;
353         J_ASSERT(commit_transaction->t_state == T_RUNNING);
354
355         trace_jbd2_start_commit(journal, commit_transaction);
356         jbd_debug(1, "JBD: starting commit of transaction %d\n",
357                         commit_transaction->t_tid);
358
359         write_lock(&journal->j_state_lock);
360         commit_transaction->t_state = T_LOCKED;
361
362         /*
363          * Use plugged writes here, since we want to submit several before
364          * we unplug the device. We don't do explicit unplugging in here,
365          * instead we rely on sync_buffer() doing the unplug for us.
366          */
367         if (commit_transaction->t_synchronous_commit)
368                 write_op = WRITE_SYNC_PLUG;
369         trace_jbd2_commit_locking(journal, commit_transaction);
370         stats.run.rs_wait = commit_transaction->t_max_wait;
371         stats.run.rs_locked = jiffies;
372         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
373                                               stats.run.rs_locked);
374
375         spin_lock(&commit_transaction->t_handle_lock);
376         while (atomic_read(&commit_transaction->t_updates)) {
377                 DEFINE_WAIT(wait);
378
379                 prepare_to_wait(&journal->j_wait_updates, &wait,
380                                         TASK_UNINTERRUPTIBLE);
381                 if (atomic_read(&commit_transaction->t_updates)) {
382                         spin_unlock(&commit_transaction->t_handle_lock);
383                         write_unlock(&journal->j_state_lock);
384                         schedule();
385                         write_lock(&journal->j_state_lock);
386                         spin_lock(&commit_transaction->t_handle_lock);
387                 }
388                 finish_wait(&journal->j_wait_updates, &wait);
389         }
390         spin_unlock(&commit_transaction->t_handle_lock);
391
392         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
393                         journal->j_max_transaction_buffers);
394
395         /*
396          * First thing we are allowed to do is to discard any remaining
397          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
398          * that there are no such buffers: if a large filesystem
399          * operation like a truncate needs to split itself over multiple
400          * transactions, then it may try to do a jbd2_journal_restart() while
401          * there are still BJ_Reserved buffers outstanding.  These must
402          * be released cleanly from the current transaction.
403          *
404          * In this case, the filesystem must still reserve write access
405          * again before modifying the buffer in the new transaction, but
406          * we do not require it to remember exactly which old buffers it
407          * has reserved.  This is consistent with the existing behaviour
408          * that multiple jbd2_journal_get_write_access() calls to the same
409          * buffer are perfectly permissable.
410          */
411         while (commit_transaction->t_reserved_list) {
412                 jh = commit_transaction->t_reserved_list;
413                 JBUFFER_TRACE(jh, "reserved, unused: refile");
414                 /*
415                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
416                  * leave undo-committed data.
417                  */
418                 if (jh->b_committed_data) {
419                         struct buffer_head *bh = jh2bh(jh);
420
421                         jbd_lock_bh_state(bh);
422                         jbd2_free(jh->b_committed_data, bh->b_size);
423                         jh->b_committed_data = NULL;
424                         jbd_unlock_bh_state(bh);
425                 }
426                 jbd2_journal_refile_buffer(journal, jh);
427         }
428
429         /*
430          * Now try to drop any written-back buffers from the journal's
431          * checkpoint lists.  We do this *before* commit because it potentially
432          * frees some memory
433          */
434         spin_lock(&journal->j_list_lock);
435         __jbd2_journal_clean_checkpoint_list(journal);
436         spin_unlock(&journal->j_list_lock);
437
438         jbd_debug (3, "JBD: commit phase 1\n");
439
440         /*
441          * Switch to a new revoke table.
442          */
443         jbd2_journal_switch_revoke_table(journal);
444
445         trace_jbd2_commit_flushing(journal, commit_transaction);
446         stats.run.rs_flushing = jiffies;
447         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
448                                              stats.run.rs_flushing);
449
450         commit_transaction->t_state = T_FLUSH;
451         journal->j_committing_transaction = commit_transaction;
452         journal->j_running_transaction = NULL;
453         start_time = ktime_get();
454         commit_transaction->t_log_start = journal->j_head;
455         wake_up(&journal->j_wait_transaction_locked);
456         write_unlock(&journal->j_state_lock);
457
458         jbd_debug (3, "JBD: commit phase 2\n");
459
460         /*
461          * Now start flushing things to disk, in the order they appear
462          * on the transaction lists.  Data blocks go first.
463          */
464         err = journal_submit_data_buffers(journal, commit_transaction);
465         if (err)
466                 jbd2_journal_abort(journal, err);
467
468         jbd2_journal_write_revoke_records(journal, commit_transaction,
469                                           write_op);
470
471         jbd_debug(3, "JBD: commit phase 2\n");
472
473         /*
474          * Way to go: we have now written out all of the data for a
475          * transaction!  Now comes the tricky part: we need to write out
476          * metadata.  Loop over the transaction's entire buffer list:
477          */
478         write_lock(&journal->j_state_lock);
479         commit_transaction->t_state = T_COMMIT;
480         write_unlock(&journal->j_state_lock);
481
482         trace_jbd2_commit_logging(journal, commit_transaction);
483         stats.run.rs_logging = jiffies;
484         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
485                                                stats.run.rs_logging);
486         stats.run.rs_blocks =
487                 atomic_read(&commit_transaction->t_outstanding_credits);
488         stats.run.rs_blocks_logged = 0;
489
490         J_ASSERT(commit_transaction->t_nr_buffers <=
491                  atomic_read(&commit_transaction->t_outstanding_credits));
492
493         err = 0;
494         descriptor = NULL;
495         bufs = 0;
496         while (commit_transaction->t_buffers) {
497
498                 /* Find the next buffer to be journaled... */
499
500                 jh = commit_transaction->t_buffers;
501
502                 /* If we're in abort mode, we just un-journal the buffer and
503                    release it. */
504
505                 if (is_journal_aborted(journal)) {
506                         clear_buffer_jbddirty(jh2bh(jh));
507                         JBUFFER_TRACE(jh, "journal is aborting: refile");
508                         jbd2_buffer_abort_trigger(jh,
509                                                   jh->b_frozen_data ?
510                                                   jh->b_frozen_triggers :
511                                                   jh->b_triggers);
512                         jbd2_journal_refile_buffer(journal, jh);
513                         /* If that was the last one, we need to clean up
514                          * any descriptor buffers which may have been
515                          * already allocated, even if we are now
516                          * aborting. */
517                         if (!commit_transaction->t_buffers)
518                                 goto start_journal_io;
519                         continue;
520                 }
521
522                 /* Make sure we have a descriptor block in which to
523                    record the metadata buffer. */
524
525                 if (!descriptor) {
526                         struct buffer_head *bh;
527
528                         J_ASSERT (bufs == 0);
529
530                         jbd_debug(4, "JBD: get descriptor\n");
531
532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
533                         if (!descriptor) {
534                                 jbd2_journal_abort(journal, -EIO);
535                                 continue;
536                         }
537
538                         bh = jh2bh(descriptor);
539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
541                         header = (journal_header_t *)&bh->b_data[0];
542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
545
546                         tagp = &bh->b_data[sizeof(journal_header_t)];
547                         space_left = bh->b_size - sizeof(journal_header_t);
548                         first_tag = 1;
549                         set_buffer_jwrite(bh);
550                         set_buffer_dirty(bh);
551                         wbuf[bufs++] = bh;
552
553                         /* Record it so that we can wait for IO
554                            completion later */
555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
557                                         BJ_LogCtl);
558                 }
559
560                 /* Where is the buffer to be written? */
561
562                 err = jbd2_journal_next_log_block(journal, &blocknr);
563                 /* If the block mapping failed, just abandon the buffer
564                    and repeat this loop: we'll fall into the
565                    refile-on-abort condition above. */
566                 if (err) {
567                         jbd2_journal_abort(journal, err);
568                         continue;
569                 }
570
571                 /*
572                  * start_this_handle() uses t_outstanding_credits to determine
573                  * the free space in the log, but this counter is changed
574                  * by jbd2_journal_next_log_block() also.
575                  */
576                 atomic_dec(&commit_transaction->t_outstanding_credits);
577
578                 /* Bump b_count to prevent truncate from stumbling over
579                    the shadowed buffer!  @@@ This can go if we ever get
580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
581                 atomic_inc(&jh2bh(jh)->b_count);
582
583                 /* Make a temporary IO buffer with which to write it out
584                    (this will requeue both the metadata buffer and the
585                    temporary IO buffer). new_bh goes on BJ_IO*/
586
587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
588                 /*
589                  * akpm: jbd2_journal_write_metadata_buffer() sets
590                  * new_bh->b_transaction to commit_transaction.
591                  * We need to clean this up before we release new_bh
592                  * (which is of type BJ_IO)
593                  */
594                 JBUFFER_TRACE(jh, "ph3: write metadata");
595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
596                                                       jh, &new_jh, blocknr);
597                 if (flags < 0) {
598                         jbd2_journal_abort(journal, flags);
599                         continue;
600                 }
601                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
602                 wbuf[bufs++] = jh2bh(new_jh);
603
604                 /* Record the new block's tag in the current descriptor
605                    buffer */
606
607                 tag_flag = 0;
608                 if (flags & 1)
609                         tag_flag |= JBD2_FLAG_ESCAPE;
610                 if (!first_tag)
611                         tag_flag |= JBD2_FLAG_SAME_UUID;
612
613                 tag = (journal_block_tag_t *) tagp;
614                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
615                 tag->t_flags = cpu_to_be32(tag_flag);
616                 tagp += tag_bytes;
617                 space_left -= tag_bytes;
618
619                 if (first_tag) {
620                         memcpy (tagp, journal->j_uuid, 16);
621                         tagp += 16;
622                         space_left -= 16;
623                         first_tag = 0;
624                 }
625
626                 /* If there's no more to do, or if the descriptor is full,
627                    let the IO rip! */
628
629                 if (bufs == journal->j_wbufsize ||
630                     commit_transaction->t_buffers == NULL ||
631                     space_left < tag_bytes + 16) {
632
633                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
634
635                         /* Write an end-of-descriptor marker before
636                            submitting the IOs.  "tag" still points to
637                            the last tag we set up. */
638
639                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
640
641 start_journal_io:
642                         for (i = 0; i < bufs; i++) {
643                                 struct buffer_head *bh = wbuf[i];
644                                 /*
645                                  * Compute checksum.
646                                  */
647                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
648                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
649                                         crc32_sum =
650                                             jbd2_checksum_data(crc32_sum, bh);
651                                 }
652
653                                 lock_buffer(bh);
654                                 clear_buffer_dirty(bh);
655                                 set_buffer_uptodate(bh);
656                                 bh->b_end_io = journal_end_buffer_io_sync;
657                                 submit_bh(write_op, bh);
658                         }
659                         cond_resched();
660                         stats.run.rs_blocks_logged += bufs;
661
662                         /* Force a new descriptor to be generated next
663                            time round the loop. */
664                         descriptor = NULL;
665                         bufs = 0;
666                 }
667         }
668
669         err = journal_finish_inode_data_buffers(journal, commit_transaction);
670         if (err) {
671                 printk(KERN_WARNING
672                         "JBD2: Detected IO errors while flushing file data "
673                        "on %s\n", journal->j_devname);
674                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
675                         jbd2_journal_abort(journal, err);
676                 err = 0;
677         }
678
679         /* 
680          * If the journal is not located on the file system device,
681          * then we must flush the file system device before we issue
682          * the commit record
683          */
684         if (commit_transaction->t_flushed_data_blocks &&
685             (journal->j_fs_dev != journal->j_dev) &&
686             (journal->j_flags & JBD2_BARRIER))
687                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
688                         BLKDEV_IFL_WAIT);
689
690         /* Done it all: now write the commit record asynchronously. */
691         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
692                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
693                 err = journal_submit_commit_record(journal, commit_transaction,
694                                                  &cbh, crc32_sum);
695                 if (err)
696                         __jbd2_journal_abort_hard(journal);
697         }
698
699         /* Lo and behold: we have just managed to send a transaction to
700            the log.  Before we can commit it, wait for the IO so far to
701            complete.  Control buffers being written are on the
702            transaction's t_log_list queue, and metadata buffers are on
703            the t_iobuf_list queue.
704
705            Wait for the buffers in reverse order.  That way we are
706            less likely to be woken up until all IOs have completed, and
707            so we incur less scheduling load.
708         */
709
710         jbd_debug(3, "JBD: commit phase 3\n");
711
712         /*
713          * akpm: these are BJ_IO, and j_list_lock is not needed.
714          * See __journal_try_to_free_buffer.
715          */
716 wait_for_iobuf:
717         while (commit_transaction->t_iobuf_list != NULL) {
718                 struct buffer_head *bh;
719
720                 jh = commit_transaction->t_iobuf_list->b_tprev;
721                 bh = jh2bh(jh);
722                 if (buffer_locked(bh)) {
723                         wait_on_buffer(bh);
724                         goto wait_for_iobuf;
725                 }
726                 if (cond_resched())
727                         goto wait_for_iobuf;
728
729                 if (unlikely(!buffer_uptodate(bh)))
730                         err = -EIO;
731
732                 clear_buffer_jwrite(bh);
733
734                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
735                 jbd2_journal_unfile_buffer(journal, jh);
736
737                 /*
738                  * ->t_iobuf_list should contain only dummy buffer_heads
739                  * which were created by jbd2_journal_write_metadata_buffer().
740                  */
741                 BUFFER_TRACE(bh, "dumping temporary bh");
742                 jbd2_journal_put_journal_head(jh);
743                 __brelse(bh);
744                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
745                 free_buffer_head(bh);
746
747                 /* We also have to unlock and free the corresponding
748                    shadowed buffer */
749                 jh = commit_transaction->t_shadow_list->b_tprev;
750                 bh = jh2bh(jh);
751                 clear_bit(BH_JWrite, &bh->b_state);
752                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
753
754                 /* The metadata is now released for reuse, but we need
755                    to remember it against this transaction so that when
756                    we finally commit, we can do any checkpointing
757                    required. */
758                 JBUFFER_TRACE(jh, "file as BJ_Forget");
759                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
760                 /* Wake up any transactions which were waiting for this
761                    IO to complete */
762                 wake_up_bit(&bh->b_state, BH_Unshadow);
763                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
764                 __brelse(bh);
765         }
766
767         J_ASSERT (commit_transaction->t_shadow_list == NULL);
768
769         jbd_debug(3, "JBD: commit phase 4\n");
770
771         /* Here we wait for the revoke record and descriptor record buffers */
772  wait_for_ctlbuf:
773         while (commit_transaction->t_log_list != NULL) {
774                 struct buffer_head *bh;
775
776                 jh = commit_transaction->t_log_list->b_tprev;
777                 bh = jh2bh(jh);
778                 if (buffer_locked(bh)) {
779                         wait_on_buffer(bh);
780                         goto wait_for_ctlbuf;
781                 }
782                 if (cond_resched())
783                         goto wait_for_ctlbuf;
784
785                 if (unlikely(!buffer_uptodate(bh)))
786                         err = -EIO;
787
788                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
789                 clear_buffer_jwrite(bh);
790                 jbd2_journal_unfile_buffer(journal, jh);
791                 jbd2_journal_put_journal_head(jh);
792                 __brelse(bh);           /* One for getblk */
793                 /* AKPM: bforget here */
794         }
795
796         if (err)
797                 jbd2_journal_abort(journal, err);
798
799         jbd_debug(3, "JBD: commit phase 5\n");
800
801         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
802                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
803                 err = journal_submit_commit_record(journal, commit_transaction,
804                                                 &cbh, crc32_sum);
805                 if (err)
806                         __jbd2_journal_abort_hard(journal);
807         }
808         if (!err && !is_journal_aborted(journal))
809                 err = journal_wait_on_commit_record(journal, cbh);
810         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
811                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
812             journal->j_flags & JBD2_BARRIER) {
813                 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
814                                    BLKDEV_IFL_WAIT);
815         }
816
817         if (err)
818                 jbd2_journal_abort(journal, err);
819
820         /* End of a transaction!  Finally, we can do checkpoint
821            processing: any buffers committed as a result of this
822            transaction can be removed from any checkpoint list it was on
823            before. */
824
825         jbd_debug(3, "JBD: commit phase 6\n");
826
827         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
828         J_ASSERT(commit_transaction->t_buffers == NULL);
829         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
830         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
831         J_ASSERT(commit_transaction->t_shadow_list == NULL);
832         J_ASSERT(commit_transaction->t_log_list == NULL);
833
834 restart_loop:
835         /*
836          * As there are other places (journal_unmap_buffer()) adding buffers
837          * to this list we have to be careful and hold the j_list_lock.
838          */
839         spin_lock(&journal->j_list_lock);
840         while (commit_transaction->t_forget) {
841                 transaction_t *cp_transaction;
842                 struct buffer_head *bh;
843
844                 jh = commit_transaction->t_forget;
845                 spin_unlock(&journal->j_list_lock);
846                 bh = jh2bh(jh);
847                 jbd_lock_bh_state(bh);
848                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
849
850                 /*
851                  * If there is undo-protected committed data against
852                  * this buffer, then we can remove it now.  If it is a
853                  * buffer needing such protection, the old frozen_data
854                  * field now points to a committed version of the
855                  * buffer, so rotate that field to the new committed
856                  * data.
857                  *
858                  * Otherwise, we can just throw away the frozen data now.
859                  *
860                  * We also know that the frozen data has already fired
861                  * its triggers if they exist, so we can clear that too.
862                  */
863                 if (jh->b_committed_data) {
864                         jbd2_free(jh->b_committed_data, bh->b_size);
865                         jh->b_committed_data = NULL;
866                         if (jh->b_frozen_data) {
867                                 jh->b_committed_data = jh->b_frozen_data;
868                                 jh->b_frozen_data = NULL;
869                                 jh->b_frozen_triggers = NULL;
870                         }
871                 } else if (jh->b_frozen_data) {
872                         jbd2_free(jh->b_frozen_data, bh->b_size);
873                         jh->b_frozen_data = NULL;
874                         jh->b_frozen_triggers = NULL;
875                 }
876
877                 spin_lock(&journal->j_list_lock);
878                 cp_transaction = jh->b_cp_transaction;
879                 if (cp_transaction) {
880                         JBUFFER_TRACE(jh, "remove from old cp transaction");
881                         cp_transaction->t_chp_stats.cs_dropped++;
882                         __jbd2_journal_remove_checkpoint(jh);
883                 }
884
885                 /* Only re-checkpoint the buffer_head if it is marked
886                  * dirty.  If the buffer was added to the BJ_Forget list
887                  * by jbd2_journal_forget, it may no longer be dirty and
888                  * there's no point in keeping a checkpoint record for
889                  * it. */
890
891                 /* A buffer which has been freed while still being
892                  * journaled by a previous transaction may end up still
893                  * being dirty here, but we want to avoid writing back
894                  * that buffer in the future after the "add to orphan"
                 * operation has been committed.  That's not only a performance
896                  * gain, it also stops aliasing problems if the buffer is
897                  * left behind for writeback and gets reallocated for another
898                  * use in a different page. */
899                 if (buffer_freed(bh) && !jh->b_next_transaction) {
900                         clear_buffer_freed(bh);
901                         clear_buffer_jbddirty(bh);
902                 }
903
904                 if (buffer_jbddirty(bh)) {
905                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
906                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
907                         if (is_journal_aborted(journal))
908                                 clear_buffer_jbddirty(bh);
909                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
910                         __jbd2_journal_refile_buffer(jh);
911                         jbd_unlock_bh_state(bh);
912                 } else {
913                         J_ASSERT_BH(bh, !buffer_dirty(bh));
914                         /* The buffer on BJ_Forget list and not jbddirty means
915                          * it has been freed by this transaction and hence it
916                          * could not have been reallocated until this
917                          * transaction has committed. *BUT* it could be
918                          * reallocated once we have written all the data to
919                          * disk and before we process the buffer on BJ_Forget
920                          * list. */
921                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
922                         __jbd2_journal_refile_buffer(jh);
923                         if (!jh->b_transaction) {
924                                 jbd_unlock_bh_state(bh);
925                                  /* needs a brelse */
926                                 jbd2_journal_remove_journal_head(bh);
927                                 release_buffer_page(bh);
928                         } else
929                                 jbd_unlock_bh_state(bh);
930                 }
931                 cond_resched_lock(&journal->j_list_lock);
932         }
933         spin_unlock(&journal->j_list_lock);
934         /*
935          * This is a bit sleazy.  We use j_list_lock to protect transition
936          * of a transaction into T_FINISHED state and calling
937          * __jbd2_journal_drop_transaction(). Otherwise we could race with
938          * other checkpointing code processing the transaction...
939          */
940         write_lock(&journal->j_state_lock);
941         spin_lock(&journal->j_list_lock);
942         /*
943          * Now recheck if some buffers did not get attached to the transaction
944          * while the lock was dropped...
945          */
946         if (commit_transaction->t_forget) {
947                 spin_unlock(&journal->j_list_lock);
948                 write_unlock(&journal->j_state_lock);
949                 goto restart_loop;
950         }
951
952         /* Done with this transaction! */
953
954         jbd_debug(3, "JBD: commit phase 7\n");
955
956         J_ASSERT(commit_transaction->t_state == T_COMMIT);
957
958         commit_transaction->t_start = jiffies;
959         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
960                                               commit_transaction->t_start);
961
962         /*
963          * File the transaction statistics
964          */
965         stats.ts_tid = commit_transaction->t_tid;
966         stats.run.rs_handle_count =
967                 atomic_read(&commit_transaction->t_handle_count);
968         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
969                              commit_transaction->t_tid, &stats.run);
970
971         /*
972          * Calculate overall stats
973          */
974         spin_lock(&journal->j_history_lock);
975         journal->j_stats.ts_tid++;
976         journal->j_stats.run.rs_wait += stats.run.rs_wait;
977         journal->j_stats.run.rs_running += stats.run.rs_running;
978         journal->j_stats.run.rs_locked += stats.run.rs_locked;
979         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
980         journal->j_stats.run.rs_logging += stats.run.rs_logging;
981         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
982         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
983         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
984         spin_unlock(&journal->j_history_lock);
985
986         commit_transaction->t_state = T_FINISHED;
987         J_ASSERT(commit_transaction == journal->j_committing_transaction);
988         journal->j_commit_sequence = commit_transaction->t_tid;
989         journal->j_committing_transaction = NULL;
990         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
991
992         /*
993          * weight the commit time higher than the average time so we don't
994          * react too strongly to vast changes in the commit time
995          */
996         if (likely(journal->j_average_commit_time))
997                 journal->j_average_commit_time = (commit_time +
998                                 journal->j_average_commit_time*3) / 4;
999         else
1000                 journal->j_average_commit_time = commit_time;
1001         write_unlock(&journal->j_state_lock);
1002
1003         if (commit_transaction->t_checkpoint_list == NULL &&
1004             commit_transaction->t_checkpoint_io_list == NULL) {
1005                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1006                 to_free = 1;
1007         } else {
1008                 if (journal->j_checkpoint_transactions == NULL) {
1009                         journal->j_checkpoint_transactions = commit_transaction;
1010                         commit_transaction->t_cpnext = commit_transaction;
1011                         commit_transaction->t_cpprev = commit_transaction;
1012                 } else {
1013                         commit_transaction->t_cpnext =
1014                                 journal->j_checkpoint_transactions;
1015                         commit_transaction->t_cpprev =
1016                                 commit_transaction->t_cpnext->t_cpprev;
1017                         commit_transaction->t_cpnext->t_cpprev =
1018                                 commit_transaction;
1019                         commit_transaction->t_cpprev->t_cpnext =
1020                                 commit_transaction;
1021                 }
1022         }
1023         spin_unlock(&journal->j_list_lock);
1024
1025         if (journal->j_commit_callback)
1026                 journal->j_commit_callback(journal, commit_transaction);
1027
1028         trace_jbd2_end_commit(journal, commit_transaction);
1029         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1030                   journal->j_commit_sequence, journal->j_tail_sequence);
1031         if (to_free)
1032                 kfree(commit_transaction);
1033
1034         wake_up(&journal->j_wait_done_commit);
1035 }