Merge branch 'for-2.6.37/barrier' of git://git.kernel.dk/linux-2.6-block
[linux-2.6.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <trace/events/jbd2.h>
30
/*
 * I/O completion handler installed as ->b_end_io on the buffer_heads that
 * the commit code submits (temporary BJ_IO buffers and the commit block).
 *
 * Mirrors the outcome of the I/O into the buffer's uptodate bit and then
 * unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);
        unlock_buffer(bh);
}
43
44 /*
45  * When an ext4 file is truncated, it is possible that some pages are not
46  * successfully freed, because they are attached to a committing transaction.
47  * After the transaction commits, these pages are left on the LRU, with no
48  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
49  * by the VM, but their apparent absence upsets the VM accounting, and it makes
50  * the numbers in /proc/meminfo look odd.
51  *
52  * So here, we have a buffer which has just come off the forget list.  Look to
53  * see if we can strip all buffers from the backing page.
54  *
55  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
56  * caller provided us with a ref against the buffer, and we drop that here.
57  */
58 static void release_buffer_page(struct buffer_head *bh)
59 {
60         struct page *page;
61
62         if (buffer_dirty(bh))
63                 goto nope;
64         if (atomic_read(&bh->b_count) != 1)
65                 goto nope;
66         page = bh->b_page;
67         if (!page)
68                 goto nope;
69         if (page->mapping)
70                 goto nope;
71
72         /* OK, it's a truncated page */
73         if (!trylock_page(page))
74                 goto nope;
75
76         page_cache_get(page);
77         __brelse(bh);
78         try_to_free_buffers(page);
79         unlock_page(page);
80         page_cache_release(page);
81         return;
82
83 nope:
84         __brelse(bh);
85 }
86
87 /*
88  * Done it all: now submit the commit record.  We should have
89  * cleaned up our previous buffers by now, so if we are in abort
90  * mode we can now just skip the rest of the journal write
91  * entirely.
92  *
93  * Returns 1 if the journal needs to be aborted or 0 on success
94  */
95 static int journal_submit_commit_record(journal_t *journal,
96                                         transaction_t *commit_transaction,
97                                         struct buffer_head **cbh,
98                                         __u32 crc32_sum)
99 {
100         struct journal_head *descriptor;
101         struct commit_header *tmp;
102         struct buffer_head *bh;
103         int ret;
104         struct timespec now = current_kernel_time();
105
106         if (is_journal_aborted(journal))
107                 return 0;
108
109         descriptor = jbd2_journal_get_descriptor_buffer(journal);
110         if (!descriptor)
111                 return 1;
112
113         bh = jh2bh(descriptor);
114
115         tmp = (struct commit_header *)bh->b_data;
116         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
117         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
118         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
119         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
120         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
121
122         if (JBD2_HAS_COMPAT_FEATURE(journal,
123                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
124                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
125                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
126                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
127         }
128
129         JBUFFER_TRACE(descriptor, "submit commit block");
130         lock_buffer(bh);
131         clear_buffer_dirty(bh);
132         set_buffer_uptodate(bh);
133         bh->b_end_io = journal_end_buffer_io_sync;
134
135         if (journal->j_flags & JBD2_BARRIER &&
136             !JBD2_HAS_INCOMPAT_FEATURE(journal,
137                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
138                 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
139         else
140                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
141
142         *cbh = bh;
143         return ret;
144 }
145
146 /*
147  * This function along with journal_submit_commit_record
148  * allows to write the commit record asynchronously.
149  */
150 static int journal_wait_on_commit_record(journal_t *journal,
151                                          struct buffer_head *bh)
152 {
153         int ret = 0;
154
155         clear_buffer_dirty(bh);
156         wait_on_buffer(bh);
157
158         if (unlikely(!buffer_uptodate(bh)))
159                 ret = -EIO;
160         put_bh(bh);            /* One for getblk() */
161         jbd2_journal_put_journal_head(bh2jh(bh));
162
163         return ret;
164 }
165
166 /*
167  * write the filemap data using writepage() address_space_operations.
168  * We don't do block allocation here even for delalloc. We don't
169  * use writepages() because with dealyed allocation we may be doing
170  * block allocation in writepages().
171  */
172 static int journal_submit_inode_data_buffers(struct address_space *mapping)
173 {
174         int ret;
175         struct writeback_control wbc = {
176                 .sync_mode =  WB_SYNC_ALL,
177                 .nr_to_write = mapping->nrpages * 2,
178                 .range_start = 0,
179                 .range_end = i_size_read(mapping->host),
180         };
181
182         ret = generic_writepages(mapping, &wbc);
183         return ret;
184 }
185
186 /*
187  * Submit all the data buffers of inode associated with the transaction to
188  * disk.
189  *
190  * We are in a committing transaction. Therefore no new inode can be added to
191  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
192  * operate on from being released while we write out pages.
193  */
194 static int journal_submit_data_buffers(journal_t *journal,
195                 transaction_t *commit_transaction)
196 {
197         struct jbd2_inode *jinode;
198         int err, ret = 0;
199         struct address_space *mapping;
200
201         spin_lock(&journal->j_list_lock);
202         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
203                 mapping = jinode->i_vfs_inode->i_mapping;
204                 jinode->i_flags |= JI_COMMIT_RUNNING;
205                 spin_unlock(&journal->j_list_lock);
206                 /*
207                  * submit the inode data buffers. We use writepage
208                  * instead of writepages. Because writepages can do
209                  * block allocation  with delalloc. We need to write
210                  * only allocated blocks here.
211                  */
212                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
213                 err = journal_submit_inode_data_buffers(mapping);
214                 if (!ret)
215                         ret = err;
216                 spin_lock(&journal->j_list_lock);
217                 J_ASSERT(jinode->i_transaction == commit_transaction);
218                 commit_transaction->t_flushed_data_blocks = 1;
219                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
220                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
221         }
222         spin_unlock(&journal->j_list_lock);
223         return ret;
224 }
225
226 /*
227  * Wait for data submitted for writeout, refile inodes to proper
228  * transaction if needed.
229  *
230  */
231 static int journal_finish_inode_data_buffers(journal_t *journal,
232                 transaction_t *commit_transaction)
233 {
234         struct jbd2_inode *jinode, *next_i;
235         int err, ret = 0;
236
237         /* For locking, see the comment in journal_submit_data_buffers() */
238         spin_lock(&journal->j_list_lock);
239         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
240                 jinode->i_flags |= JI_COMMIT_RUNNING;
241                 spin_unlock(&journal->j_list_lock);
242                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
243                 if (err) {
244                         /*
245                          * Because AS_EIO is cleared by
246                          * filemap_fdatawait_range(), set it again so
247                          * that user process can get -EIO from fsync().
248                          */
249                         set_bit(AS_EIO,
250                                 &jinode->i_vfs_inode->i_mapping->flags);
251
252                         if (!ret)
253                                 ret = err;
254                 }
255                 spin_lock(&journal->j_list_lock);
256                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
257                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
258         }
259
260         /* Now refile inode to proper lists */
261         list_for_each_entry_safe(jinode, next_i,
262                                  &commit_transaction->t_inode_list, i_list) {
263                 list_del(&jinode->i_list);
264                 if (jinode->i_next_transaction) {
265                         jinode->i_transaction = jinode->i_next_transaction;
266                         jinode->i_next_transaction = NULL;
267                         list_add(&jinode->i_list,
268                                 &jinode->i_transaction->t_inode_list);
269                 } else {
270                         jinode->i_transaction = NULL;
271                 }
272         }
273         spin_unlock(&journal->j_list_lock);
274
275         return ret;
276 }
277
278 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
279 {
280         struct page *page = bh->b_page;
281         char *addr;
282         __u32 checksum;
283
284         addr = kmap_atomic(page, KM_USER0);
285         checksum = crc32_be(crc32_sum,
286                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
287         kunmap_atomic(addr, KM_USER0);
288
289         return checksum;
290 }
291
292 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
293                                    unsigned long long block)
294 {
295         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
296         if (tag_bytes > JBD2_TAG_SIZE32)
297                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
298 }
299
300 /*
301  * jbd2_journal_commit_transaction
302  *
303  * The primary function for committing a transaction to the log.  This
304  * function is called by the journal thread to begin a complete commit.
305  */
306 void jbd2_journal_commit_transaction(journal_t *journal)
307 {
308         struct transaction_stats_s stats;
309         transaction_t *commit_transaction;
310         struct journal_head *jh, *new_jh, *descriptor;
311         struct buffer_head **wbuf = journal->j_wbuf;
312         int bufs;
313         int flags;
314         int err;
315         unsigned long long blocknr;
316         ktime_t start_time;
317         u64 commit_time;
318         char *tagp = NULL;
319         journal_header_t *header;
320         journal_block_tag_t *tag = NULL;
321         int space_left = 0;
322         int first_tag = 0;
323         int tag_flag;
324         int i, to_free = 0;
325         int tag_bytes = journal_tag_bytes(journal);
326         struct buffer_head *cbh = NULL; /* For transactional checksums */
327         __u32 crc32_sum = ~0;
328         int write_op = WRITE_SYNC;
329
330         /*
331          * First job: lock down the current transaction and wait for
332          * all outstanding updates to complete.
333          */
334
335 #ifdef COMMIT_STATS
336         spin_lock(&journal->j_list_lock);
337         summarise_journal_usage(journal);
338         spin_unlock(&journal->j_list_lock);
339 #endif
340
341         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
342         if (journal->j_flags & JBD2_FLUSHED) {
343                 jbd_debug(3, "super block updated\n");
344                 jbd2_journal_update_superblock(journal, 1);
345         } else {
346                 jbd_debug(3, "superblock not updated\n");
347         }
348
349         J_ASSERT(journal->j_running_transaction != NULL);
350         J_ASSERT(journal->j_committing_transaction == NULL);
351
352         commit_transaction = journal->j_running_transaction;
353         J_ASSERT(commit_transaction->t_state == T_RUNNING);
354
355         trace_jbd2_start_commit(journal, commit_transaction);
356         jbd_debug(1, "JBD: starting commit of transaction %d\n",
357                         commit_transaction->t_tid);
358
359         write_lock(&journal->j_state_lock);
360         commit_transaction->t_state = T_LOCKED;
361
362         /*
363          * Use plugged writes here, since we want to submit several before
364          * we unplug the device. We don't do explicit unplugging in here,
365          * instead we rely on sync_buffer() doing the unplug for us.
366          */
367         if (commit_transaction->t_synchronous_commit)
368                 write_op = WRITE_SYNC_PLUG;
369         trace_jbd2_commit_locking(journal, commit_transaction);
370         stats.run.rs_wait = commit_transaction->t_max_wait;
371         stats.run.rs_locked = jiffies;
372         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
373                                               stats.run.rs_locked);
374
375         spin_lock(&commit_transaction->t_handle_lock);
376         while (atomic_read(&commit_transaction->t_updates)) {
377                 DEFINE_WAIT(wait);
378
379                 prepare_to_wait(&journal->j_wait_updates, &wait,
380                                         TASK_UNINTERRUPTIBLE);
381                 if (atomic_read(&commit_transaction->t_updates)) {
382                         spin_unlock(&commit_transaction->t_handle_lock);
383                         write_unlock(&journal->j_state_lock);
384                         schedule();
385                         write_lock(&journal->j_state_lock);
386                         spin_lock(&commit_transaction->t_handle_lock);
387                 }
388                 finish_wait(&journal->j_wait_updates, &wait);
389         }
390         spin_unlock(&commit_transaction->t_handle_lock);
391
392         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
393                         journal->j_max_transaction_buffers);
394
395         /*
396          * First thing we are allowed to do is to discard any remaining
397          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
398          * that there are no such buffers: if a large filesystem
399          * operation like a truncate needs to split itself over multiple
400          * transactions, then it may try to do a jbd2_journal_restart() while
401          * there are still BJ_Reserved buffers outstanding.  These must
402          * be released cleanly from the current transaction.
403          *
404          * In this case, the filesystem must still reserve write access
405          * again before modifying the buffer in the new transaction, but
406          * we do not require it to remember exactly which old buffers it
407          * has reserved.  This is consistent with the existing behaviour
408          * that multiple jbd2_journal_get_write_access() calls to the same
409          * buffer are perfectly permissible.
410          */
411         while (commit_transaction->t_reserved_list) {
412                 jh = commit_transaction->t_reserved_list;
413                 JBUFFER_TRACE(jh, "reserved, unused: refile");
414                 /*
415                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
416                  * leave undo-committed data.
417                  */
418                 if (jh->b_committed_data) {
419                         struct buffer_head *bh = jh2bh(jh);
420
421                         jbd_lock_bh_state(bh);
422                         jbd2_free(jh->b_committed_data, bh->b_size);
423                         jh->b_committed_data = NULL;
424                         jbd_unlock_bh_state(bh);
425                 }
426                 jbd2_journal_refile_buffer(journal, jh);
427         }
428
429         /*
430          * Now try to drop any written-back buffers from the journal's
431          * checkpoint lists.  We do this *before* commit because it potentially
432          * frees some memory
433          */
434         spin_lock(&journal->j_list_lock);
435         __jbd2_journal_clean_checkpoint_list(journal);
436         spin_unlock(&journal->j_list_lock);
437
438         jbd_debug (3, "JBD: commit phase 1\n");
439
440         /*
441          * Switch to a new revoke table.
442          */
443         jbd2_journal_switch_revoke_table(journal);
444
445         trace_jbd2_commit_flushing(journal, commit_transaction);
446         stats.run.rs_flushing = jiffies;
447         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
448                                              stats.run.rs_flushing);
449
450         commit_transaction->t_state = T_FLUSH;
451         journal->j_committing_transaction = commit_transaction;
452         journal->j_running_transaction = NULL;
453         start_time = ktime_get();
454         commit_transaction->t_log_start = journal->j_head;
455         wake_up(&journal->j_wait_transaction_locked);
456         write_unlock(&journal->j_state_lock);
457
458         jbd_debug (3, "JBD: commit phase 2\n");
459
460         /*
461          * Now start flushing things to disk, in the order they appear
462          * on the transaction lists.  Data blocks go first.
463          */
464         err = journal_submit_data_buffers(journal, commit_transaction);
465         if (err)
466                 jbd2_journal_abort(journal, err);
467
468         jbd2_journal_write_revoke_records(journal, commit_transaction,
469                                           write_op);
470
471         jbd_debug(3, "JBD: commit phase 2\n");
472
473         /*
474          * Way to go: we have now written out all of the data for a
475          * transaction!  Now comes the tricky part: we need to write out
476          * metadata.  Loop over the transaction's entire buffer list:
477          */
478         write_lock(&journal->j_state_lock);
479         commit_transaction->t_state = T_COMMIT;
480         write_unlock(&journal->j_state_lock);
481
482         trace_jbd2_commit_logging(journal, commit_transaction);
483         stats.run.rs_logging = jiffies;
484         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
485                                                stats.run.rs_logging);
486         stats.run.rs_blocks =
487                 atomic_read(&commit_transaction->t_outstanding_credits);
488         stats.run.rs_blocks_logged = 0;
489
490         J_ASSERT(commit_transaction->t_nr_buffers <=
491                  atomic_read(&commit_transaction->t_outstanding_credits));
492
493         err = 0;
494         descriptor = NULL;
495         bufs = 0;
496         while (commit_transaction->t_buffers) {
497
498                 /* Find the next buffer to be journaled... */
499
500                 jh = commit_transaction->t_buffers;
501
502                 /* If we're in abort mode, we just un-journal the buffer and
503                    release it. */
504
505                 if (is_journal_aborted(journal)) {
506                         clear_buffer_jbddirty(jh2bh(jh));
507                         JBUFFER_TRACE(jh, "journal is aborting: refile");
508                         jbd2_buffer_abort_trigger(jh,
509                                                   jh->b_frozen_data ?
510                                                   jh->b_frozen_triggers :
511                                                   jh->b_triggers);
512                         jbd2_journal_refile_buffer(journal, jh);
513                         /* If that was the last one, we need to clean up
514                          * any descriptor buffers which may have been
515                          * already allocated, even if we are now
516                          * aborting. */
517                         if (!commit_transaction->t_buffers)
518                                 goto start_journal_io;
519                         continue;
520                 }
521
522                 /* Make sure we have a descriptor block in which to
523                    record the metadata buffer. */
524
525                 if (!descriptor) {
526                         struct buffer_head *bh;
527
528                         J_ASSERT (bufs == 0);
529
530                         jbd_debug(4, "JBD: get descriptor\n");
531
532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
533                         if (!descriptor) {
534                                 jbd2_journal_abort(journal, -EIO);
535                                 continue;
536                         }
537
538                         bh = jh2bh(descriptor);
539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
541                         header = (journal_header_t *)&bh->b_data[0];
542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
545
546                         tagp = &bh->b_data[sizeof(journal_header_t)];
547                         space_left = bh->b_size - sizeof(journal_header_t);
548                         first_tag = 1;
549                         set_buffer_jwrite(bh);
550                         set_buffer_dirty(bh);
551                         wbuf[bufs++] = bh;
552
553                         /* Record it so that we can wait for IO
554                            completion later */
555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
557                                         BJ_LogCtl);
558                 }
559
560                 /* Where is the buffer to be written? */
561
562                 err = jbd2_journal_next_log_block(journal, &blocknr);
563                 /* If the block mapping failed, just abandon the buffer
564                    and repeat this loop: we'll fall into the
565                    refile-on-abort condition above. */
566                 if (err) {
567                         jbd2_journal_abort(journal, err);
568                         continue;
569                 }
570
571                 /*
572                  * start_this_handle() uses t_outstanding_credits to determine
573                  * the free space in the log, but this counter is changed
574                  * by jbd2_journal_next_log_block() also.
575                  */
576                 atomic_dec(&commit_transaction->t_outstanding_credits);
577
578                 /* Bump b_count to prevent truncate from stumbling over
579                    the shadowed buffer!  @@@ This can go if we ever get
580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
581                 atomic_inc(&jh2bh(jh)->b_count);
582
583                 /* Make a temporary IO buffer with which to write it out
584                    (this will requeue both the metadata buffer and the
585                    temporary IO buffer). new_bh goes on BJ_IO*/
586
587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
588                 /*
589                  * akpm: jbd2_journal_write_metadata_buffer() sets
590                  * new_bh->b_transaction to commit_transaction.
591                  * We need to clean this up before we release new_bh
592                  * (which is of type BJ_IO)
593                  */
594                 JBUFFER_TRACE(jh, "ph3: write metadata");
595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
596                                                       jh, &new_jh, blocknr);
597                 if (flags < 0) {
598                         jbd2_journal_abort(journal, flags);
599                         continue;
600                 }
601                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
602                 wbuf[bufs++] = jh2bh(new_jh);
603
604                 /* Record the new block's tag in the current descriptor
605                    buffer */
606
607                 tag_flag = 0;
608                 if (flags & 1)
609                         tag_flag |= JBD2_FLAG_ESCAPE;
610                 if (!first_tag)
611                         tag_flag |= JBD2_FLAG_SAME_UUID;
612
613                 tag = (journal_block_tag_t *) tagp;
614                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
615                 tag->t_flags = cpu_to_be32(tag_flag);
616                 tagp += tag_bytes;
617                 space_left -= tag_bytes;
618
619                 if (first_tag) {
620                         memcpy (tagp, journal->j_uuid, 16);
621                         tagp += 16;
622                         space_left -= 16;
623                         first_tag = 0;
624                 }
625
626                 /* If there's no more to do, or if the descriptor is full,
627                    let the IO rip! */
628
629                 if (bufs == journal->j_wbufsize ||
630                     commit_transaction->t_buffers == NULL ||
631                     space_left < tag_bytes + 16) {
632
633                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
634
635                         /* Write an end-of-descriptor marker before
636                            submitting the IOs.  "tag" still points to
637                            the last tag we set up. */
638
639                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
640
641 start_journal_io:
642                         for (i = 0; i < bufs; i++) {
643                                 struct buffer_head *bh = wbuf[i];
644                                 /*
645                                  * Compute checksum.
646                                  */
647                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
648                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
649                                         crc32_sum =
650                                             jbd2_checksum_data(crc32_sum, bh);
651                                 }
652
653                                 lock_buffer(bh);
654                                 clear_buffer_dirty(bh);
655                                 set_buffer_uptodate(bh);
656                                 bh->b_end_io = journal_end_buffer_io_sync;
657                                 submit_bh(write_op, bh);
658                         }
659                         cond_resched();
660                         stats.run.rs_blocks_logged += bufs;
661
662                         /* Force a new descriptor to be generated next
663                            time round the loop. */
664                         descriptor = NULL;
665                         bufs = 0;
666                 }
667         }
668
669         err = journal_finish_inode_data_buffers(journal, commit_transaction);
670         if (err) {
671                 printk(KERN_WARNING
672                         "JBD2: Detected IO errors while flushing file data "
673                        "on %s\n", journal->j_devname);
674                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
675                         jbd2_journal_abort(journal, err);
676                 err = 0;
677         }
678
679         /* 
680          * If the journal is not located on the file system device,
681          * then we must flush the file system device before we issue
682          * the commit record
683          */
684         if (commit_transaction->t_flushed_data_blocks &&
685             (journal->j_fs_dev != journal->j_dev) &&
686             (journal->j_flags & JBD2_BARRIER))
687                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
688
689         /* Done it all: now write the commit record asynchronously. */
690         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
691                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
692                 err = journal_submit_commit_record(journal, commit_transaction,
693                                                  &cbh, crc32_sum);
694                 if (err)
695                         __jbd2_journal_abort_hard(journal);
696         }
697
698         /* Lo and behold: we have just managed to send a transaction to
699            the log.  Before we can commit it, wait for the IO so far to
700            complete.  Control buffers being written are on the
701            transaction's t_log_list queue, and metadata buffers are on
702            the t_iobuf_list queue.
703
704            Wait for the buffers in reverse order.  That way we are
705            less likely to be woken up until all IOs have completed, and
706            so we incur less scheduling load.
707         */
708
709         jbd_debug(3, "JBD: commit phase 3\n");
710
711         /*
712          * akpm: these are BJ_IO, and j_list_lock is not needed.
713          * See __journal_try_to_free_buffer.
714          */
715 wait_for_iobuf:
716         while (commit_transaction->t_iobuf_list != NULL) {
717                 struct buffer_head *bh;
718
719                 jh = commit_transaction->t_iobuf_list->b_tprev;
720                 bh = jh2bh(jh);
721                 if (buffer_locked(bh)) {
722                         wait_on_buffer(bh);
723                         goto wait_for_iobuf;
724                 }
725                 if (cond_resched())
726                         goto wait_for_iobuf;
727
728                 if (unlikely(!buffer_uptodate(bh)))
729                         err = -EIO;
730
731                 clear_buffer_jwrite(bh);
732
733                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
734                 jbd2_journal_unfile_buffer(journal, jh);
735
736                 /*
737                  * ->t_iobuf_list should contain only dummy buffer_heads
738                  * which were created by jbd2_journal_write_metadata_buffer().
739                  */
740                 BUFFER_TRACE(bh, "dumping temporary bh");
741                 jbd2_journal_put_journal_head(jh);
742                 __brelse(bh);
743                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
744                 free_buffer_head(bh);
745
746                 /* We also have to unlock and free the corresponding
747                    shadowed buffer */
748                 jh = commit_transaction->t_shadow_list->b_tprev;
749                 bh = jh2bh(jh);
750                 clear_bit(BH_JWrite, &bh->b_state);
751                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
752
753                 /* The metadata is now released for reuse, but we need
754                    to remember it against this transaction so that when
755                    we finally commit, we can do any checkpointing
756                    required. */
757                 JBUFFER_TRACE(jh, "file as BJ_Forget");
758                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
759                 /* Wake up any transactions which were waiting for this
760                    IO to complete */
761                 wake_up_bit(&bh->b_state, BH_Unshadow);
762                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
763                 __brelse(bh);
764         }
765
766         J_ASSERT (commit_transaction->t_shadow_list == NULL);
767
768         jbd_debug(3, "JBD: commit phase 4\n");
769
770         /* Here we wait for the revoke record and descriptor record buffers */
771  wait_for_ctlbuf:
772         while (commit_transaction->t_log_list != NULL) {
773                 struct buffer_head *bh;
774
775                 jh = commit_transaction->t_log_list->b_tprev;
776                 bh = jh2bh(jh);
777                 if (buffer_locked(bh)) {
778                         wait_on_buffer(bh);
779                         goto wait_for_ctlbuf;
780                 }
781                 if (cond_resched())
782                         goto wait_for_ctlbuf;
783
784                 if (unlikely(!buffer_uptodate(bh)))
785                         err = -EIO;
786
787                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
788                 clear_buffer_jwrite(bh);
789                 jbd2_journal_unfile_buffer(journal, jh);
790                 jbd2_journal_put_journal_head(jh);
791                 __brelse(bh);           /* One for getblk */
792                 /* AKPM: bforget here */
793         }
794
795         if (err)
796                 jbd2_journal_abort(journal, err);
797
798         jbd_debug(3, "JBD: commit phase 5\n");
799
800         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
801                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
802                 err = journal_submit_commit_record(journal, commit_transaction,
803                                                 &cbh, crc32_sum);
804                 if (err)
805                         __jbd2_journal_abort_hard(journal);
806         }
807         if (!err && !is_journal_aborted(journal))
808                 err = journal_wait_on_commit_record(journal, cbh);
809         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
810                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
811             journal->j_flags & JBD2_BARRIER) {
812                 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
813         }
814
815         if (err)
816                 jbd2_journal_abort(journal, err);
817
818         /* End of a transaction!  Finally, we can do checkpoint
819            processing: any buffers committed as a result of this
820            transaction can be removed from any checkpoint list it was on
821            before. */
822
823         jbd_debug(3, "JBD: commit phase 6\n");
824
825         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
826         J_ASSERT(commit_transaction->t_buffers == NULL);
827         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
828         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
829         J_ASSERT(commit_transaction->t_shadow_list == NULL);
830         J_ASSERT(commit_transaction->t_log_list == NULL);
831
832 restart_loop:
833         /*
834          * As there are other places (journal_unmap_buffer()) adding buffers
835          * to this list we have to be careful and hold the j_list_lock.
836          */
837         spin_lock(&journal->j_list_lock);
838         while (commit_transaction->t_forget) {
839                 transaction_t *cp_transaction;
840                 struct buffer_head *bh;
841
842                 jh = commit_transaction->t_forget;
843                 spin_unlock(&journal->j_list_lock);
844                 bh = jh2bh(jh);
845                 jbd_lock_bh_state(bh);
846                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
847
848                 /*
849                  * If there is undo-protected committed data against
850                  * this buffer, then we can remove it now.  If it is a
851                  * buffer needing such protection, the old frozen_data
852                  * field now points to a committed version of the
853                  * buffer, so rotate that field to the new committed
854                  * data.
855                  *
856                  * Otherwise, we can just throw away the frozen data now.
857                  *
858                  * We also know that the frozen data has already fired
859                  * its triggers if they exist, so we can clear that too.
860                  */
861                 if (jh->b_committed_data) {
862                         jbd2_free(jh->b_committed_data, bh->b_size);
863                         jh->b_committed_data = NULL;
864                         if (jh->b_frozen_data) {
865                                 jh->b_committed_data = jh->b_frozen_data;
866                                 jh->b_frozen_data = NULL;
867                                 jh->b_frozen_triggers = NULL;
868                         }
869                 } else if (jh->b_frozen_data) {
870                         jbd2_free(jh->b_frozen_data, bh->b_size);
871                         jh->b_frozen_data = NULL;
872                         jh->b_frozen_triggers = NULL;
873                 }
874
875                 spin_lock(&journal->j_list_lock);
876                 cp_transaction = jh->b_cp_transaction;
877                 if (cp_transaction) {
878                         JBUFFER_TRACE(jh, "remove from old cp transaction");
879                         cp_transaction->t_chp_stats.cs_dropped++;
880                         __jbd2_journal_remove_checkpoint(jh);
881                 }
882
883                 /* Only re-checkpoint the buffer_head if it is marked
884                  * dirty.  If the buffer was added to the BJ_Forget list
885                  * by jbd2_journal_forget, it may no longer be dirty and
886                  * there's no point in keeping a checkpoint record for
887                  * it. */
888
889                 /* A buffer which has been freed while still being
890                  * journaled by a previous transaction may end up still
891                  * being dirty here, but we want to avoid writing back
892                  * that buffer in the future after the "add to orphan"
893                  * operation has been committed.  That's not only a performance
894                  * gain, it also stops aliasing problems if the buffer is
895                  * left behind for writeback and gets reallocated for another
896                  * use in a different page. */
897                 if (buffer_freed(bh) && !jh->b_next_transaction) {
898                         clear_buffer_freed(bh);
899                         clear_buffer_jbddirty(bh);
900                 }
901
902                 if (buffer_jbddirty(bh)) {
903                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
904                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
905                         if (is_journal_aborted(journal))
906                                 clear_buffer_jbddirty(bh);
907                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
908                         __jbd2_journal_refile_buffer(jh);
909                         jbd_unlock_bh_state(bh);
910                 } else {
911                         J_ASSERT_BH(bh, !buffer_dirty(bh));
912                         /* The buffer on BJ_Forget list and not jbddirty means
913                          * it has been freed by this transaction and hence it
914                          * could not have been reallocated until this
915                          * transaction has committed. *BUT* it could be
916                          * reallocated once we have written all the data to
917                          * disk and before we process the buffer on BJ_Forget
918                          * list. */
919                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
920                         __jbd2_journal_refile_buffer(jh);
921                         if (!jh->b_transaction) {
922                                 jbd_unlock_bh_state(bh);
923                                  /* needs a brelse */
924                                 jbd2_journal_remove_journal_head(bh);
925                                 release_buffer_page(bh);
926                         } else
927                                 jbd_unlock_bh_state(bh);
928                 }
929                 cond_resched_lock(&journal->j_list_lock);
930         }
931         spin_unlock(&journal->j_list_lock);
932         /*
933          * This is a bit sleazy.  We use j_list_lock to protect transition
934          * of a transaction into T_FINISHED state and calling
935          * __jbd2_journal_drop_transaction(). Otherwise we could race with
936          * other checkpointing code processing the transaction...
937          */
938         write_lock(&journal->j_state_lock);
939         spin_lock(&journal->j_list_lock);
940         /*
941          * Now recheck if some buffers did not get attached to the transaction
942          * while the lock was dropped...
943          */
944         if (commit_transaction->t_forget) {
945                 spin_unlock(&journal->j_list_lock);
946                 write_unlock(&journal->j_state_lock);
947                 goto restart_loop;
948         }
949
950         /* Done with this transaction! */
951
952         jbd_debug(3, "JBD: commit phase 7\n");
953
954         J_ASSERT(commit_transaction->t_state == T_COMMIT);
955
956         commit_transaction->t_start = jiffies;
957         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
958                                               commit_transaction->t_start);
959
960         /*
961          * File the transaction statistics
962          */
963         stats.ts_tid = commit_transaction->t_tid;
964         stats.run.rs_handle_count =
965                 atomic_read(&commit_transaction->t_handle_count);
966         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
967                              commit_transaction->t_tid, &stats.run);
968
969         /*
970          * Calculate overall stats
971          */
972         spin_lock(&journal->j_history_lock);
973         journal->j_stats.ts_tid++;
974         journal->j_stats.run.rs_wait += stats.run.rs_wait;
975         journal->j_stats.run.rs_running += stats.run.rs_running;
976         journal->j_stats.run.rs_locked += stats.run.rs_locked;
977         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
978         journal->j_stats.run.rs_logging += stats.run.rs_logging;
979         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
980         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
981         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
982         spin_unlock(&journal->j_history_lock);
983
984         commit_transaction->t_state = T_FINISHED;
985         J_ASSERT(commit_transaction == journal->j_committing_transaction);
986         journal->j_commit_sequence = commit_transaction->t_tid;
987         journal->j_committing_transaction = NULL;
988         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
989
990         /*
991          * weight the average time higher than the latest commit time so
992          * we don't react too strongly to vast changes in the commit time
993          */
994         if (likely(journal->j_average_commit_time))
995                 journal->j_average_commit_time = (commit_time +
996                                 journal->j_average_commit_time*3) / 4;
997         else
998                 journal->j_average_commit_time = commit_time;
999         write_unlock(&journal->j_state_lock);
1000
1001         if (commit_transaction->t_checkpoint_list == NULL &&
1002             commit_transaction->t_checkpoint_io_list == NULL) {
1003                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1004                 to_free = 1;
1005         } else {
1006                 if (journal->j_checkpoint_transactions == NULL) {
1007                         journal->j_checkpoint_transactions = commit_transaction;
1008                         commit_transaction->t_cpnext = commit_transaction;
1009                         commit_transaction->t_cpprev = commit_transaction;
1010                 } else {
1011                         commit_transaction->t_cpnext =
1012                                 journal->j_checkpoint_transactions;
1013                         commit_transaction->t_cpprev =
1014                                 commit_transaction->t_cpnext->t_cpprev;
1015                         commit_transaction->t_cpnext->t_cpprev =
1016                                 commit_transaction;
1017                         commit_transaction->t_cpprev->t_cpnext =
1018                                 commit_transaction;
1019                 }
1020         }
1021         spin_unlock(&journal->j_list_lock);
1022
1023         if (journal->j_commit_callback)
1024                 journal->j_commit_callback(journal, commit_transaction);
1025
1026         trace_jbd2_end_commit(journal, commit_transaction);
1027         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1028                   journal->j_commit_sequence, journal->j_tail_sequence);
1029         if (to_free)
1030                 kfree(commit_transaction);
1031
1032         wake_up(&journal->j_wait_done_commit);
1033 }