jbd2: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and explicit plugging
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31 #include <asm/system.h>
32
33 /*
34  * Default IO end handler for temporary BJ_IO buffer_heads.
35  */
/*
 * Completion callback for the temporary BJ_IO buffer_heads written during
 * commit: latch the I/O result into the buffer's uptodate bit and release
 * the buffer lock so that waiters (wait_on_buffer) can proceed.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
45
46 /*
47  * When an ext4 file is truncated, it is possible that some pages are not
48  * successfully freed, because they are attached to a committing transaction.
49  * After the transaction commits, these pages are left on the LRU, with no
50  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
51  * by the VM, but their apparent absence upsets the VM accounting, and it makes
52  * the numbers in /proc/meminfo look odd.
53  *
54  * So here, we have a buffer which has just come off the forget list.  Look to
55  * see if we can strip all buffers from the backing page.
56  *
57  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
58  * caller provided us with a ref against the buffer, and we drop that here.
59  */
60 static void release_buffer_page(struct buffer_head *bh)
61 {
62         struct page *page;
63
64         if (buffer_dirty(bh))
65                 goto nope;
66         if (atomic_read(&bh->b_count) != 1)
67                 goto nope;
68         page = bh->b_page;
69         if (!page)
70                 goto nope;
71         if (page->mapping)
72                 goto nope;
73
74         /* OK, it's a truncated page */
75         if (!trylock_page(page))
76                 goto nope;
77
78         page_cache_get(page);
79         __brelse(bh);
80         try_to_free_buffers(page);
81         unlock_page(page);
82         page_cache_release(page);
83         return;
84
85 nope:
86         __brelse(bh);
87 }
88
89 /*
90  * Done it all: now submit the commit record.  We should have
91  * cleaned up our previous buffers by now, so if we are in abort
92  * mode we can now just skip the rest of the journal write
93  * entirely.
94  *
95  * Returns 1 if the journal needs to be aborted or 0 on success
96  */
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	/* Aborted journal: write nothing further; not an error here. */
	if (is_journal_aborted(journal))
		return 0;

	/* Allocate the journal block that will carry the commit record. */
	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/*
	 * Fill in the commit header: magic, block type, the committing
	 * transaction's tid, and the commit wall-clock timestamp.
	 */
	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	/*
	 * If the journal carries the CHECKSUM compat feature, record the
	 * crc32 accumulated over the transaction's blocks so replay can
	 * validate them.
	 */
	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	/*
	 * Synchronous commit with barriers enabled: the commit record must
	 * be ordered after the preceding journal writes and be durable, so
	 * issue it with a preceding cache flush and FUA.  With async commit
	 * (or barriers off) a plain WRITE_SYNC suffices.
	 */
	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	/* Hand the in-flight buffer back so the caller can wait on it. */
	*cbh = bh;
	return ret;
}
147
148 /*
149  * This function along with journal_submit_commit_record
150  * allows to write the commit record asynchronously.
151  */
152 static int journal_wait_on_commit_record(journal_t *journal,
153                                          struct buffer_head *bh)
154 {
155         int ret = 0;
156
157         clear_buffer_dirty(bh);
158         wait_on_buffer(bh);
159
160         if (unlikely(!buffer_uptodate(bh)))
161                 ret = -EIO;
162         put_bh(bh);            /* One for getblk() */
163         jbd2_journal_put_journal_head(bh2jh(bh));
164
165         return ret;
166 }
167
168 /*
169  * write the filemap data using writepage() address_space_operations.
170  * We don't do block allocation here even for delalloc. We don't
171  * use writepages() because with dealyed allocation we may be doing
172  * block allocation in writepages().
173  */
174 static int journal_submit_inode_data_buffers(struct address_space *mapping)
175 {
176         int ret;
177         struct writeback_control wbc = {
178                 .sync_mode =  WB_SYNC_ALL,
179                 .nr_to_write = mapping->nrpages * 2,
180                 .range_start = 0,
181                 .range_end = i_size_read(mapping->host),
182         };
183
184         ret = generic_writepages(mapping, &wbc);
185         return ret;
186 }
187
188 /*
189  * Submit all the data buffers of inode associated with the transaction to
190  * disk.
191  *
192  * We are in a committing transaction. Therefore no new inode can be added to
193  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
194  * operate on from being released while we write out pages.
195  */
/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 *
 * Returns 0 on success, or the first error seen while submitting any
 * inode's pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		/*
		 * Pin this entry: JI_COMMIT_RUNNING keeps the jbd2_inode
		 * alive while we drop j_list_lock to do the (sleeping)
		 * page submission.
		 */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * submit the inode data buffers. We use writepage
		 * instead of writepages. Because writepages can do
		 * block allocation  with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		/* Remember only the first error; keep submitting the rest. */
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		/* Note that data went to the fs device (may differ from
		 * the journal device; a pre-commit flush may be needed). */
		commit_transaction->t_flushed_data_blocks = 1;
		/* Unpin and wake anyone waiting in JI_COMMIT_RUNNING. */
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
228
229 /*
230  * Wait for data submitted for writeout, refile inodes to proper
231  * transaction if needed.
232  *
233  */
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 * Returns 0 on success, or the first writeback error observed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Pin the entry while we sleep waiting for its pages. */
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			/* Remember only the first error. */
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Inode has data in the next transaction too:
			 * move it onto that transaction's inode list. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* Fully written out: detach from any transaction. */
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
281
282 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
283 {
284         struct page *page = bh->b_page;
285         char *addr;
286         __u32 checksum;
287
288         addr = kmap_atomic(page, KM_USER0);
289         checksum = crc32_be(crc32_sum,
290                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
291         kunmap_atomic(addr, KM_USER0);
292
293         return checksum;
294 }
295
296 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
297                                    unsigned long long block)
298 {
299         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
300         if (tag_bytes > JBD2_TAG_SIZE32)
301                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
302 }
303
304 /*
305  * jbd2_journal_commit_transaction
306  *
307  * The primary function for committing a transaction to the log.  This
308  * function is called by the journal thread to begin a complete commit.
309  */
310 void jbd2_journal_commit_transaction(journal_t *journal)
311 {
312         struct transaction_stats_s stats;
313         transaction_t *commit_transaction;
314         struct journal_head *jh, *new_jh, *descriptor;
315         struct buffer_head **wbuf = journal->j_wbuf;
316         int bufs;
317         int flags;
318         int err;
319         unsigned long long blocknr;
320         ktime_t start_time;
321         u64 commit_time;
322         char *tagp = NULL;
323         journal_header_t *header;
324         journal_block_tag_t *tag = NULL;
325         int space_left = 0;
326         int first_tag = 0;
327         int tag_flag;
328         int i, to_free = 0;
329         int tag_bytes = journal_tag_bytes(journal);
330         struct buffer_head *cbh = NULL; /* For transactional checksums */
331         __u32 crc32_sum = ~0;
332         struct blk_plug plug;
333
334         /*
335          * First job: lock down the current transaction and wait for
336          * all outstanding updates to complete.
337          */
338
339 #ifdef COMMIT_STATS
340         spin_lock(&journal->j_list_lock);
341         summarise_journal_usage(journal);
342         spin_unlock(&journal->j_list_lock);
343 #endif
344
345         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
346         if (journal->j_flags & JBD2_FLUSHED) {
347                 jbd_debug(3, "super block updated\n");
348                 jbd2_journal_update_superblock(journal, 1);
349         } else {
350                 jbd_debug(3, "superblock not updated\n");
351         }
352
353         J_ASSERT(journal->j_running_transaction != NULL);
354         J_ASSERT(journal->j_committing_transaction == NULL);
355
356         commit_transaction = journal->j_running_transaction;
357         J_ASSERT(commit_transaction->t_state == T_RUNNING);
358
359         trace_jbd2_start_commit(journal, commit_transaction);
360         jbd_debug(1, "JBD: starting commit of transaction %d\n",
361                         commit_transaction->t_tid);
362
363         write_lock(&journal->j_state_lock);
364         commit_transaction->t_state = T_LOCKED;
365
366         trace_jbd2_commit_locking(journal, commit_transaction);
367         stats.run.rs_wait = commit_transaction->t_max_wait;
368         stats.run.rs_locked = jiffies;
369         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
370                                               stats.run.rs_locked);
371
372         spin_lock(&commit_transaction->t_handle_lock);
373         while (atomic_read(&commit_transaction->t_updates)) {
374                 DEFINE_WAIT(wait);
375
376                 prepare_to_wait(&journal->j_wait_updates, &wait,
377                                         TASK_UNINTERRUPTIBLE);
378                 if (atomic_read(&commit_transaction->t_updates)) {
379                         spin_unlock(&commit_transaction->t_handle_lock);
380                         write_unlock(&journal->j_state_lock);
381                         schedule();
382                         write_lock(&journal->j_state_lock);
383                         spin_lock(&commit_transaction->t_handle_lock);
384                 }
385                 finish_wait(&journal->j_wait_updates, &wait);
386         }
387         spin_unlock(&commit_transaction->t_handle_lock);
388
389         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
390                         journal->j_max_transaction_buffers);
391
392         /*
393          * First thing we are allowed to do is to discard any remaining
394          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
395          * that there are no such buffers: if a large filesystem
396          * operation like a truncate needs to split itself over multiple
397          * transactions, then it may try to do a jbd2_journal_restart() while
398          * there are still BJ_Reserved buffers outstanding.  These must
399          * be released cleanly from the current transaction.
400          *
401          * In this case, the filesystem must still reserve write access
402          * again before modifying the buffer in the new transaction, but
403          * we do not require it to remember exactly which old buffers it
404          * has reserved.  This is consistent with the existing behaviour
405          * that multiple jbd2_journal_get_write_access() calls to the same
406          * buffer are perfectly permissable.
407          */
408         while (commit_transaction->t_reserved_list) {
409                 jh = commit_transaction->t_reserved_list;
410                 JBUFFER_TRACE(jh, "reserved, unused: refile");
411                 /*
412                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
413                  * leave undo-committed data.
414                  */
415                 if (jh->b_committed_data) {
416                         struct buffer_head *bh = jh2bh(jh);
417
418                         jbd_lock_bh_state(bh);
419                         jbd2_free(jh->b_committed_data, bh->b_size);
420                         jh->b_committed_data = NULL;
421                         jbd_unlock_bh_state(bh);
422                 }
423                 jbd2_journal_refile_buffer(journal, jh);
424         }
425
426         /*
427          * Now try to drop any written-back buffers from the journal's
428          * checkpoint lists.  We do this *before* commit because it potentially
429          * frees some memory
430          */
431         spin_lock(&journal->j_list_lock);
432         __jbd2_journal_clean_checkpoint_list(journal);
433         spin_unlock(&journal->j_list_lock);
434
435         jbd_debug (3, "JBD: commit phase 1\n");
436
437         /*
438          * Switch to a new revoke table.
439          */
440         jbd2_journal_switch_revoke_table(journal);
441
442         trace_jbd2_commit_flushing(journal, commit_transaction);
443         stats.run.rs_flushing = jiffies;
444         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
445                                              stats.run.rs_flushing);
446
447         commit_transaction->t_state = T_FLUSH;
448         journal->j_committing_transaction = commit_transaction;
449         journal->j_running_transaction = NULL;
450         start_time = ktime_get();
451         commit_transaction->t_log_start = journal->j_head;
452         wake_up(&journal->j_wait_transaction_locked);
453         write_unlock(&journal->j_state_lock);
454
455         jbd_debug (3, "JBD: commit phase 2\n");
456
457         /*
458          * Now start flushing things to disk, in the order they appear
459          * on the transaction lists.  Data blocks go first.
460          */
461         err = journal_submit_data_buffers(journal, commit_transaction);
462         if (err)
463                 jbd2_journal_abort(journal, err);
464
465         blk_start_plug(&plug);
466         jbd2_journal_write_revoke_records(journal, commit_transaction,
467                                           WRITE_SYNC);
468         blk_finish_plug(&plug);
469
470         jbd_debug(3, "JBD: commit phase 2\n");
471
472         /*
473          * Way to go: we have now written out all of the data for a
474          * transaction!  Now comes the tricky part: we need to write out
475          * metadata.  Loop over the transaction's entire buffer list:
476          */
477         write_lock(&journal->j_state_lock);
478         commit_transaction->t_state = T_COMMIT;
479         write_unlock(&journal->j_state_lock);
480
481         trace_jbd2_commit_logging(journal, commit_transaction);
482         stats.run.rs_logging = jiffies;
483         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
484                                                stats.run.rs_logging);
485         stats.run.rs_blocks =
486                 atomic_read(&commit_transaction->t_outstanding_credits);
487         stats.run.rs_blocks_logged = 0;
488
489         J_ASSERT(commit_transaction->t_nr_buffers <=
490                  atomic_read(&commit_transaction->t_outstanding_credits));
491
492         err = 0;
493         descriptor = NULL;
494         bufs = 0;
495         blk_start_plug(&plug);
496         while (commit_transaction->t_buffers) {
497
498                 /* Find the next buffer to be journaled... */
499
500                 jh = commit_transaction->t_buffers;
501
502                 /* If we're in abort mode, we just un-journal the buffer and
503                    release it. */
504
505                 if (is_journal_aborted(journal)) {
506                         clear_buffer_jbddirty(jh2bh(jh));
507                         JBUFFER_TRACE(jh, "journal is aborting: refile");
508                         jbd2_buffer_abort_trigger(jh,
509                                                   jh->b_frozen_data ?
510                                                   jh->b_frozen_triggers :
511                                                   jh->b_triggers);
512                         jbd2_journal_refile_buffer(journal, jh);
513                         /* If that was the last one, we need to clean up
514                          * any descriptor buffers which may have been
515                          * already allocated, even if we are now
516                          * aborting. */
517                         if (!commit_transaction->t_buffers)
518                                 goto start_journal_io;
519                         continue;
520                 }
521
522                 /* Make sure we have a descriptor block in which to
523                    record the metadata buffer. */
524
525                 if (!descriptor) {
526                         struct buffer_head *bh;
527
528                         J_ASSERT (bufs == 0);
529
530                         jbd_debug(4, "JBD: get descriptor\n");
531
532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
533                         if (!descriptor) {
534                                 jbd2_journal_abort(journal, -EIO);
535                                 continue;
536                         }
537
538                         bh = jh2bh(descriptor);
539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
541                         header = (journal_header_t *)&bh->b_data[0];
542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
545
546                         tagp = &bh->b_data[sizeof(journal_header_t)];
547                         space_left = bh->b_size - sizeof(journal_header_t);
548                         first_tag = 1;
549                         set_buffer_jwrite(bh);
550                         set_buffer_dirty(bh);
551                         wbuf[bufs++] = bh;
552
553                         /* Record it so that we can wait for IO
554                            completion later */
555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
557                                         BJ_LogCtl);
558                 }
559
560                 /* Where is the buffer to be written? */
561
562                 err = jbd2_journal_next_log_block(journal, &blocknr);
563                 /* If the block mapping failed, just abandon the buffer
564                    and repeat this loop: we'll fall into the
565                    refile-on-abort condition above. */
566                 if (err) {
567                         jbd2_journal_abort(journal, err);
568                         continue;
569                 }
570
571                 /*
572                  * start_this_handle() uses t_outstanding_credits to determine
573                  * the free space in the log, but this counter is changed
574                  * by jbd2_journal_next_log_block() also.
575                  */
576                 atomic_dec(&commit_transaction->t_outstanding_credits);
577
578                 /* Bump b_count to prevent truncate from stumbling over
579                    the shadowed buffer!  @@@ This can go if we ever get
580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
581                 atomic_inc(&jh2bh(jh)->b_count);
582
583                 /* Make a temporary IO buffer with which to write it out
584                    (this will requeue both the metadata buffer and the
585                    temporary IO buffer). new_bh goes on BJ_IO*/
586
587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
588                 /*
589                  * akpm: jbd2_journal_write_metadata_buffer() sets
590                  * new_bh->b_transaction to commit_transaction.
591                  * We need to clean this up before we release new_bh
592                  * (which is of type BJ_IO)
593                  */
594                 JBUFFER_TRACE(jh, "ph3: write metadata");
595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
596                                                       jh, &new_jh, blocknr);
597                 if (flags < 0) {
598                         jbd2_journal_abort(journal, flags);
599                         continue;
600                 }
601                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
602                 wbuf[bufs++] = jh2bh(new_jh);
603
604                 /* Record the new block's tag in the current descriptor
605                    buffer */
606
607                 tag_flag = 0;
608                 if (flags & 1)
609                         tag_flag |= JBD2_FLAG_ESCAPE;
610                 if (!first_tag)
611                         tag_flag |= JBD2_FLAG_SAME_UUID;
612
613                 tag = (journal_block_tag_t *) tagp;
614                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
615                 tag->t_flags = cpu_to_be32(tag_flag);
616                 tagp += tag_bytes;
617                 space_left -= tag_bytes;
618
619                 if (first_tag) {
620                         memcpy (tagp, journal->j_uuid, 16);
621                         tagp += 16;
622                         space_left -= 16;
623                         first_tag = 0;
624                 }
625
626                 /* If there's no more to do, or if the descriptor is full,
627                    let the IO rip! */
628
629                 if (bufs == journal->j_wbufsize ||
630                     commit_transaction->t_buffers == NULL ||
631                     space_left < tag_bytes + 16) {
632
633                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
634
635                         /* Write an end-of-descriptor marker before
636                            submitting the IOs.  "tag" still points to
637                            the last tag we set up. */
638
639                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
640
641 start_journal_io:
642                         for (i = 0; i < bufs; i++) {
643                                 struct buffer_head *bh = wbuf[i];
644                                 /*
645                                  * Compute checksum.
646                                  */
647                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
648                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
649                                         crc32_sum =
650                                             jbd2_checksum_data(crc32_sum, bh);
651                                 }
652
653                                 lock_buffer(bh);
654                                 clear_buffer_dirty(bh);
655                                 set_buffer_uptodate(bh);
656                                 bh->b_end_io = journal_end_buffer_io_sync;
657                                 submit_bh(WRITE_SYNC, bh);
658                         }
659                         cond_resched();
660                         stats.run.rs_blocks_logged += bufs;
661
662                         /* Force a new descriptor to be generated next
663                            time round the loop. */
664                         descriptor = NULL;
665                         bufs = 0;
666                 }
667         }
668
669         err = journal_finish_inode_data_buffers(journal, commit_transaction);
670         if (err) {
671                 printk(KERN_WARNING
672                         "JBD2: Detected IO errors while flushing file data "
673                        "on %s\n", journal->j_devname);
674                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
675                         jbd2_journal_abort(journal, err);
676                 err = 0;
677         }
678
679         /* 
680          * If the journal is not located on the file system device,
681          * then we must flush the file system device before we issue
682          * the commit record
683          */
684         if (commit_transaction->t_flushed_data_blocks &&
685             (journal->j_fs_dev != journal->j_dev) &&
686             (journal->j_flags & JBD2_BARRIER))
687                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
688
689         /* Done it all: now write the commit record asynchronously. */
690         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
691                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
692                 err = journal_submit_commit_record(journal, commit_transaction,
693                                                  &cbh, crc32_sum);
694                 if (err)
695                         __jbd2_journal_abort_hard(journal);
696         }
697
698         blk_finish_plug(&plug);
699
700         /* Lo and behold: we have just managed to send a transaction to
701            the log.  Before we can commit it, wait for the IO so far to
702            complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		/* Walk from the list tail (b_tprev) — see "reverse order"
		   note above. */
		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		/* If we rescheduled, the list may have changed under us;
		   restart the scan from the top. */
		if (cond_resched())
			goto wait_for_iobuf;

		/* Record the first write error; the journal is aborted
		   with it further down. */
		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer.  The iobuf and shadow lists are kept
		   in lock-step, so the tail of t_shadow_list pairs with
		   the iobuf entry we just freed. */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* For journals without the async-commit feature, the commit
	   record is submitted only here, after all preceding log
	   buffers have been written out above. */
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);
	/* Async-commit journals with barriers enabled flush the device
	   cache here instead (NOTE(review): presumably to order the
	   already-submitted commit record against the log blocks —
	   confirm against journal_submit_commit_record's caller). */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All per-transaction buffer lists except t_forget must be
	   empty by now. */
	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		/* Drop j_list_lock while we free committed/frozen data
		   (jbd2_free may sleep); re-taken below before touching
		   the checkpoint lists. */
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a
		 * performance gain, it also stops aliasing problems if
		 * the buffer is left behind for writeback and gets
		 * reallocated for another use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	/* Mark the transaction finished and detach it from the journal;
	   done under j_state_lock (taken above). */
	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

	/* If nothing remains to checkpoint, the transaction can be
	   freed right away; otherwise link it into the journal's
	   circular checkpoint list (still under j_list_lock from
	   above). */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}