blob: 447c8d93f48081c11ed85e6076df2617af4f20c0 [file] [log] [blame]
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07001// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -070010#include "ext4.h"
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -070011#include "ext4_jbd2.h"
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -070012#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
80 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * TODOs
107 * -----
108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109 * eligible update must be protected within ext4_fc_start_update() and
110 * ext4_fc_stop_update(). These routines are called at much higher
111 * routines. This can be made more fine grained by combining with
112 * ext4_journal_start().
113 *
114 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115 *
116 * 3) Handle more ineligible cases.
117 */
118
119#include <trace/events/ext4.h>
120static struct kmem_cache *ext4_fc_dentry_cachep;
121
122static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123{
124 BUFFER_TRACE(bh, "");
125 if (uptodate) {
126 ext4_debug("%s: Block %lld up-to-date",
127 __func__, bh->b_blocknr);
128 set_buffer_uptodate(bh);
129 } else {
130 ext4_debug("%s: Block %lld not up-to-date",
131 __func__, bh->b_blocknr);
132 clear_buffer_uptodate(bh);
133 }
134
135 unlock_buffer(bh);
136}
137
138static inline void ext4_fc_reset_inode(struct inode *inode)
139{
140 struct ext4_inode_info *ei = EXT4_I(inode);
141
142 ei->i_fc_lblk_start = 0;
143 ei->i_fc_lblk_len = 0;
144}
145
146void ext4_fc_init_inode(struct inode *inode)
147{
148 struct ext4_inode_info *ei = EXT4_I(inode);
149
150 ext4_fc_reset_inode(inode);
151 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 INIT_LIST_HEAD(&ei->i_fc_list);
153 init_waitqueue_head(&ei->i_fc_wait);
154 atomic_set(&ei->i_fc_updates, 0);
155 ei->i_fc_committed_subtid = 0;
156}
157
158/*
159 * Inform Ext4's fast about start of an inode update
160 *
161 * This function is called by the high level call VFS callbacks before
162 * performing any inode update. This function blocks if there's an ongoing
163 * fast commit on the inode in question.
164 */
165void ext4_fc_start_update(struct inode *inode)
166{
167 struct ext4_inode_info *ei = EXT4_I(inode);
168
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700169 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
170 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700171 return;
172
173restart:
174 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
175 if (list_empty(&ei->i_fc_list))
176 goto out;
177
178 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
179 wait_queue_head_t *wq;
180#if (BITS_PER_LONG < 64)
181 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
182 EXT4_STATE_FC_COMMITTING);
183 wq = bit_waitqueue(&ei->i_state_flags,
184 EXT4_STATE_FC_COMMITTING);
185#else
186 DEFINE_WAIT_BIT(wait, &ei->i_flags,
187 EXT4_STATE_FC_COMMITTING);
188 wq = bit_waitqueue(&ei->i_flags,
189 EXT4_STATE_FC_COMMITTING);
190#endif
191 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
192 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
193 schedule();
194 finish_wait(wq, &wait.wq_entry);
195 goto restart;
196 }
197out:
198 atomic_inc(&ei->i_fc_updates);
199 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
200}
201
202/*
203 * Stop inode update and wake up waiting fast commits if any.
204 */
205void ext4_fc_stop_update(struct inode *inode)
206{
207 struct ext4_inode_info *ei = EXT4_I(inode);
208
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700209 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
210 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700211 return;
212
213 if (atomic_dec_and_test(&ei->i_fc_updates))
214 wake_up_all(&ei->i_fc_wait);
215}
216
217/*
218 * Remove inode from fast commit list. If the inode is being committed
219 * we wait until inode commit is done.
220 */
221void ext4_fc_del(struct inode *inode)
222{
223 struct ext4_inode_info *ei = EXT4_I(inode);
224
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700225 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
226 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700227 return;
228
229restart:
230 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
231 if (list_empty(&ei->i_fc_list)) {
232 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
233 return;
234 }
235
236 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
237 wait_queue_head_t *wq;
238#if (BITS_PER_LONG < 64)
239 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
240 EXT4_STATE_FC_COMMITTING);
241 wq = bit_waitqueue(&ei->i_state_flags,
242 EXT4_STATE_FC_COMMITTING);
243#else
244 DEFINE_WAIT_BIT(wait, &ei->i_flags,
245 EXT4_STATE_FC_COMMITTING);
246 wq = bit_waitqueue(&ei->i_flags,
247 EXT4_STATE_FC_COMMITTING);
248#endif
249 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
250 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 schedule();
252 finish_wait(wq, &wait.wq_entry);
253 goto restart;
254 }
255 if (!list_empty(&ei->i_fc_list))
256 list_del_init(&ei->i_fc_list);
257 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
258}
259
260/*
261 * Mark file system as fast commit ineligible. This means that next commit
262 * operation would result in a full jbd2 commit.
263 */
264void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
265{
266 struct ext4_sb_info *sbi = EXT4_SB(sb);
267
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700268 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
269 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
270 return;
271
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700272 sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
273 WARN_ON(reason >= EXT4_FC_REASON_MAX);
274 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
275}
276
277/*
278 * Start a fast commit ineligible update. Any commits that happen while
279 * such an operation is in progress fall back to full commits.
280 */
281void ext4_fc_start_ineligible(struct super_block *sb, int reason)
282{
283 struct ext4_sb_info *sbi = EXT4_SB(sb);
284
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700285 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
286 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
287 return;
288
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700289 WARN_ON(reason >= EXT4_FC_REASON_MAX);
290 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
291 atomic_inc(&sbi->s_fc_ineligible_updates);
292}
293
294/*
295 * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here
296 * to ensure that after stopping the ineligible update, at least one full
297 * commit takes place.
298 */
299void ext4_fc_stop_ineligible(struct super_block *sb)
300{
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700301 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
302 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
303 return;
304
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700305 EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
306 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
307}
308
309static inline int ext4_fc_is_ineligible(struct super_block *sb)
310{
311 return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) ||
312 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
313}
314
315/*
316 * Generic fast commit tracking function. If this is the first time this we are
317 * called after a full commit, we initialize fast commit fields and then call
318 * __fc_track_fn() with update = 0. If we have already been called after a full
319 * commit, we pass update = 1. Based on that, the track function can determine
320 * if it needs to track a field for the first time or if it needs to just
321 * update the previously tracked value.
322 *
323 * If enqueue is set, this function enqueues the inode in fast commit list.
324 */
325static int ext4_fc_track_template(
326 struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool),
327 void *args, int enqueue)
328{
329 tid_t running_txn_tid;
330 bool update = false;
331 struct ext4_inode_info *ei = EXT4_I(inode);
332 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
333 int ret;
334
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700335 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
336 (sbi->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700337 return -EOPNOTSUPP;
338
339 if (ext4_fc_is_ineligible(inode->i_sb))
340 return -EINVAL;
341
342 running_txn_tid = sbi->s_journal ?
343 sbi->s_journal->j_commit_sequence + 1 : 0;
344
345 mutex_lock(&ei->i_fc_lock);
346 if (running_txn_tid == ei->i_sync_tid) {
347 update = true;
348 } else {
349 ext4_fc_reset_inode(inode);
350 ei->i_sync_tid = running_txn_tid;
351 }
352 ret = __fc_track_fn(inode, args, update);
353 mutex_unlock(&ei->i_fc_lock);
354
355 if (!enqueue)
356 return ret;
357
358 spin_lock(&sbi->s_fc_lock);
359 if (list_empty(&EXT4_I(inode)->i_fc_list))
360 list_add_tail(&EXT4_I(inode)->i_fc_list,
361 (sbi->s_mount_state & EXT4_FC_COMMITTING) ?
362 &sbi->s_fc_q[FC_Q_STAGING] :
363 &sbi->s_fc_q[FC_Q_MAIN]);
364 spin_unlock(&sbi->s_fc_lock);
365
366 return ret;
367}
368
/* Argument bundle handed to __track_dentry_update() via the void pointer. */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* Fast commit tag of the operation (EXT4_FC_TAG_*). */
	int op;
};
373
374/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
375static int __track_dentry_update(struct inode *inode, void *arg, bool update)
376{
377 struct ext4_fc_dentry_update *node;
378 struct ext4_inode_info *ei = EXT4_I(inode);
379 struct __track_dentry_update_args *dentry_update =
380 (struct __track_dentry_update_args *)arg;
381 struct dentry *dentry = dentry_update->dentry;
382 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
383
384 mutex_unlock(&ei->i_fc_lock);
385 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
386 if (!node) {
387 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
388 mutex_lock(&ei->i_fc_lock);
389 return -ENOMEM;
390 }
391
392 node->fcd_op = dentry_update->op;
393 node->fcd_parent = dentry->d_parent->d_inode->i_ino;
394 node->fcd_ino = inode->i_ino;
395 if (dentry->d_name.len > DNAME_INLINE_LEN) {
396 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
397 if (!node->fcd_name.name) {
398 kmem_cache_free(ext4_fc_dentry_cachep, node);
399 ext4_fc_mark_ineligible(inode->i_sb,
400 EXT4_FC_REASON_MEM);
401 mutex_lock(&ei->i_fc_lock);
402 return -ENOMEM;
403 }
404 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
405 dentry->d_name.len);
406 } else {
407 memcpy(node->fcd_iname, dentry->d_name.name,
408 dentry->d_name.len);
409 node->fcd_name.name = node->fcd_iname;
410 }
411 node->fcd_name.len = dentry->d_name.len;
412
413 spin_lock(&sbi->s_fc_lock);
414 if (sbi->s_mount_state & EXT4_FC_COMMITTING)
415 list_add_tail(&node->fcd_list,
416 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
417 else
418 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
419 spin_unlock(&sbi->s_fc_lock);
420 mutex_lock(&ei->i_fc_lock);
421
422 return 0;
423}
424
425void ext4_fc_track_unlink(struct inode *inode, struct dentry *dentry)
426{
427 struct __track_dentry_update_args args;
428 int ret;
429
430 args.dentry = dentry;
431 args.op = EXT4_FC_TAG_UNLINK;
432
433 ret = ext4_fc_track_template(inode, __track_dentry_update,
434 (void *)&args, 0);
435 trace_ext4_fc_track_unlink(inode, dentry, ret);
436}
437
438void ext4_fc_track_link(struct inode *inode, struct dentry *dentry)
439{
440 struct __track_dentry_update_args args;
441 int ret;
442
443 args.dentry = dentry;
444 args.op = EXT4_FC_TAG_LINK;
445
446 ret = ext4_fc_track_template(inode, __track_dentry_update,
447 (void *)&args, 0);
448 trace_ext4_fc_track_link(inode, dentry, ret);
449}
450
451void ext4_fc_track_create(struct inode *inode, struct dentry *dentry)
452{
453 struct __track_dentry_update_args args;
454 int ret;
455
456 args.dentry = dentry;
457 args.op = EXT4_FC_TAG_CREAT;
458
459 ret = ext4_fc_track_template(inode, __track_dentry_update,
460 (void *)&args, 0);
461 trace_ext4_fc_track_create(inode, dentry, ret);
462}
463
464/* __track_fn for inode tracking */
465static int __track_inode(struct inode *inode, void *arg, bool update)
466{
467 if (update)
468 return -EEXIST;
469
470 EXT4_I(inode)->i_fc_lblk_len = 0;
471
472 return 0;
473}
474
475void ext4_fc_track_inode(struct inode *inode)
476{
477 int ret;
478
479 if (S_ISDIR(inode->i_mode))
480 return;
481
482 ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
483 trace_ext4_fc_track_inode(inode, ret);
484}
485
486struct __track_range_args {
487 ext4_lblk_t start, end;
488};
489
490/* __track_fn for tracking data updates */
491static int __track_range(struct inode *inode, void *arg, bool update)
492{
493 struct ext4_inode_info *ei = EXT4_I(inode);
494 ext4_lblk_t oldstart;
495 struct __track_range_args *__arg =
496 (struct __track_range_args *)arg;
497
498 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
499 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
500 return -ECANCELED;
501 }
502
503 oldstart = ei->i_fc_lblk_start;
504
505 if (update && ei->i_fc_lblk_len > 0) {
506 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
507 ei->i_fc_lblk_len =
508 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
509 ei->i_fc_lblk_start + 1;
510 } else {
511 ei->i_fc_lblk_start = __arg->start;
512 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
513 }
514
515 return 0;
516}
517
518void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
519 ext4_lblk_t end)
520{
521 struct __track_range_args args;
522 int ret;
523
524 if (S_ISDIR(inode->i_mode))
525 return;
526
527 args.start = start;
528 args.end = end;
529
530 ret = ext4_fc_track_template(inode, __track_range, &args, 1);
531
532 trace_ext4_fc_track_range(inode, start, end, ret);
533}
534
535static void ext4_fc_submit_bh(struct super_block *sb)
536{
537 int write_flags = REQ_SYNC;
538 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
539
540 if (test_opt(sb, BARRIER))
541 write_flags |= REQ_FUA | REQ_PREFLUSH;
542 lock_buffer(bh);
543 clear_buffer_dirty(bh);
544 set_buffer_uptodate(bh);
545 bh->b_end_io = ext4_end_buffer_io_sync;
546 submit_bh(REQ_OP_WRITE, write_flags, bh);
547 EXT4_SB(sb)->s_fc_bh = NULL;
548}
549
550/* Ext4 commit path routines */
551
552/* memzero and update CRC */
553static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
554 u32 *crc)
555{
556 void *ret;
557
558 ret = memset(dst, 0, len);
559 if (crc)
560 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
561 return ret;
562}
563
564/*
565 * Allocate len bytes on a fast commit buffer.
566 *
567 * During the commit time this function is used to manage fast commit
568 * block space. We don't split a fast commit log onto different
569 * blocks. So this function makes sure that if there's not enough space
570 * on the current block, the remaining space in the current block is
571 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
572 * new block is from jbd2 and CRC is updated to reflect the padding
573 * we added.
574 */
575static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
576{
577 struct ext4_fc_tl *tl;
578 struct ext4_sb_info *sbi = EXT4_SB(sb);
579 struct buffer_head *bh;
580 int bsize = sbi->s_journal->j_blocksize;
581 int ret, off = sbi->s_fc_bytes % bsize;
582 int pad_len;
583
584 /*
585 * After allocating len, we should have space at least for a 0 byte
586 * padding.
587 */
588 if (len + sizeof(struct ext4_fc_tl) > bsize)
589 return NULL;
590
591 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
592 /*
593 * Only allocate from current buffer if we have enough space for
594 * this request AND we have space to add a zero byte padding.
595 */
596 if (!sbi->s_fc_bh) {
597 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
598 if (ret)
599 return NULL;
600 sbi->s_fc_bh = bh;
601 }
602 sbi->s_fc_bytes += len;
603 return sbi->s_fc_bh->b_data + off;
604 }
605 /* Need to add PAD tag */
606 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
607 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
608 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
609 tl->fc_len = cpu_to_le16(pad_len);
610 if (crc)
611 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
612 if (pad_len > 0)
613 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
614 ext4_fc_submit_bh(sb);
615
616 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
617 if (ret)
618 return NULL;
619 sbi->s_fc_bh = bh;
620 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
621 return sbi->s_fc_bh->b_data;
622}
623
624/* memcpy to fc reserved space and update CRC */
625static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
626 int len, u32 *crc)
627{
628 if (crc)
629 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
630 return memcpy(dst, src, len);
631}
632
633/*
634 * Complete a fast commit by writing tail tag.
635 *
636 * Writing tail tag marks the end of a fast commit. In order to guarantee
637 * atomicity, after writing tail tag, even if there's space remaining
638 * in the block, next commit shouldn't use it. That's why tail tag
639 * has the length as that of the remaining space on the block.
640 */
641static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
642{
643 struct ext4_sb_info *sbi = EXT4_SB(sb);
644 struct ext4_fc_tl tl;
645 struct ext4_fc_tail tail;
646 int off, bsize = sbi->s_journal->j_blocksize;
647 u8 *dst;
648
649 /*
650 * ext4_fc_reserve_space takes care of allocating an extra block if
651 * there's no enough space on this block for accommodating this tail.
652 */
653 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
654 if (!dst)
655 return -ENOSPC;
656
657 off = sbi->s_fc_bytes % bsize;
658
659 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
660 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
661 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
662
663 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
664 dst += sizeof(tl);
665 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
666 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
667 dst += sizeof(tail.fc_tid);
668 tail.fc_crc = cpu_to_le32(crc);
669 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
670
671 ext4_fc_submit_bh(sb);
672
673 return 0;
674}
675
676/*
677 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
678 * Returns false if there's not enough space.
679 */
680static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
681 u32 *crc)
682{
683 struct ext4_fc_tl tl;
684 u8 *dst;
685
686 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
687 if (!dst)
688 return false;
689
690 tl.fc_tag = cpu_to_le16(tag);
691 tl.fc_len = cpu_to_le16(len);
692
693 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
694 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
695
696 return true;
697}
698
699/* Same as above, but adds dentry tlv. */
700static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
701 int parent_ino, int ino, int dlen,
702 const unsigned char *dname,
703 u32 *crc)
704{
705 struct ext4_fc_dentry_info fcd;
706 struct ext4_fc_tl tl;
707 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
708 crc);
709
710 if (!dst)
711 return false;
712
713 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
714 fcd.fc_ino = cpu_to_le32(ino);
715 tl.fc_tag = cpu_to_le16(tag);
716 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
717 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
718 dst += sizeof(tl);
719 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
720 dst += sizeof(fcd);
721 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
722 dst += dlen;
723
724 return true;
725}
726
727/*
728 * Writes inode in the fast commit space under TLV with tag @tag.
729 * Returns 0 on success, error on failure.
730 */
731static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
732{
733 struct ext4_inode_info *ei = EXT4_I(inode);
734 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
735 int ret;
736 struct ext4_iloc iloc;
737 struct ext4_fc_inode fc_inode;
738 struct ext4_fc_tl tl;
739 u8 *dst;
740
741 ret = ext4_get_inode_loc(inode, &iloc);
742 if (ret)
743 return ret;
744
745 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
746 inode_len += ei->i_extra_isize;
747
748 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
749 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
750 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
751
752 dst = ext4_fc_reserve_space(inode->i_sb,
753 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
754 if (!dst)
755 return -ECANCELED;
756
757 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
758 return -ECANCELED;
759 dst += sizeof(tl);
760 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
761 return -ECANCELED;
762 dst += sizeof(fc_inode);
763 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
764 inode_len, crc))
765 return -ECANCELED;
766
767 return 0;
768}
769
770/*
771 * Writes updated data ranges for the inode in question. Updates CRC.
772 * Returns 0 on success, error otherwise.
773 */
774static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
775{
776 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
777 struct ext4_inode_info *ei = EXT4_I(inode);
778 struct ext4_map_blocks map;
779 struct ext4_fc_add_range fc_ext;
780 struct ext4_fc_del_range lrange;
781 struct ext4_extent *ex;
782 int ret;
783
784 mutex_lock(&ei->i_fc_lock);
785 if (ei->i_fc_lblk_len == 0) {
786 mutex_unlock(&ei->i_fc_lock);
787 return 0;
788 }
789 old_blk_size = ei->i_fc_lblk_start;
790 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
791 ei->i_fc_lblk_len = 0;
792 mutex_unlock(&ei->i_fc_lock);
793
794 cur_lblk_off = old_blk_size;
795 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
796 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
797
798 while (cur_lblk_off <= new_blk_size) {
799 map.m_lblk = cur_lblk_off;
800 map.m_len = new_blk_size - cur_lblk_off + 1;
801 ret = ext4_map_blocks(NULL, inode, &map, 0);
802 if (ret < 0)
803 return -ECANCELED;
804
805 if (map.m_len == 0) {
806 cur_lblk_off++;
807 continue;
808 }
809
810 if (ret == 0) {
811 lrange.fc_ino = cpu_to_le32(inode->i_ino);
812 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
813 lrange.fc_len = cpu_to_le32(map.m_len);
814 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
815 sizeof(lrange), (u8 *)&lrange, crc))
816 return -ENOSPC;
817 } else {
818 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
819 ex = (struct ext4_extent *)&fc_ext.fc_ex;
820 ex->ee_block = cpu_to_le32(map.m_lblk);
821 ex->ee_len = cpu_to_le16(map.m_len);
822 ext4_ext_store_pblock(ex, map.m_pblk);
823 if (map.m_flags & EXT4_MAP_UNWRITTEN)
824 ext4_ext_mark_unwritten(ex);
825 else
826 ext4_ext_mark_initialized(ex);
827 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
828 sizeof(fc_ext), (u8 *)&fc_ext, crc))
829 return -ENOSPC;
830 }
831
832 cur_lblk_off += map.m_len;
833 }
834
835 return 0;
836}
837
838
839/* Submit data for all the fast commit inodes */
840static int ext4_fc_submit_inode_data_all(journal_t *journal)
841{
842 struct super_block *sb = (struct super_block *)(journal->j_private);
843 struct ext4_sb_info *sbi = EXT4_SB(sb);
844 struct ext4_inode_info *ei;
845 struct list_head *pos;
846 int ret = 0;
847
848 spin_lock(&sbi->s_fc_lock);
849 sbi->s_mount_state |= EXT4_FC_COMMITTING;
850 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
851 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
852 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
853 while (atomic_read(&ei->i_fc_updates)) {
854 DEFINE_WAIT(wait);
855
856 prepare_to_wait(&ei->i_fc_wait, &wait,
857 TASK_UNINTERRUPTIBLE);
858 if (atomic_read(&ei->i_fc_updates)) {
859 spin_unlock(&sbi->s_fc_lock);
860 schedule();
861 spin_lock(&sbi->s_fc_lock);
862 }
863 finish_wait(&ei->i_fc_wait, &wait);
864 }
865 spin_unlock(&sbi->s_fc_lock);
866 ret = jbd2_submit_inode_data(ei->jinode);
867 if (ret)
868 return ret;
869 spin_lock(&sbi->s_fc_lock);
870 }
871 spin_unlock(&sbi->s_fc_lock);
872
873 return ret;
874}
875
876/* Wait for completion of data for all the fast commit inodes */
877static int ext4_fc_wait_inode_data_all(journal_t *journal)
878{
879 struct super_block *sb = (struct super_block *)(journal->j_private);
880 struct ext4_sb_info *sbi = EXT4_SB(sb);
881 struct ext4_inode_info *pos, *n;
882 int ret = 0;
883
884 spin_lock(&sbi->s_fc_lock);
885 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
886 if (!ext4_test_inode_state(&pos->vfs_inode,
887 EXT4_STATE_FC_COMMITTING))
888 continue;
889 spin_unlock(&sbi->s_fc_lock);
890
891 ret = jbd2_wait_inode_data(journal, pos->jinode);
892 if (ret)
893 return ret;
894 spin_lock(&sbi->s_fc_lock);
895 }
896 spin_unlock(&sbi->s_fc_lock);
897
898 return 0;
899}
900
901/* Commit all the directory entry updates */
902static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
903{
904 struct super_block *sb = (struct super_block *)(journal->j_private);
905 struct ext4_sb_info *sbi = EXT4_SB(sb);
906 struct ext4_fc_dentry_update *fc_dentry;
907 struct inode *inode;
908 struct list_head *pos, *n, *fcd_pos, *fcd_n;
909 struct ext4_inode_info *ei;
910 int ret;
911
912 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
913 return 0;
914 list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
915 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
916 fcd_list);
917 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
918 spin_unlock(&sbi->s_fc_lock);
919 if (!ext4_fc_add_dentry_tlv(
920 sb, fc_dentry->fcd_op,
921 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
922 fc_dentry->fcd_name.len,
923 fc_dentry->fcd_name.name, crc)) {
924 ret = -ENOSPC;
925 goto lock_and_exit;
926 }
927 spin_lock(&sbi->s_fc_lock);
928 continue;
929 }
930
931 inode = NULL;
932 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
933 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
934 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
935 inode = &ei->vfs_inode;
936 break;
937 }
938 }
939 /*
940 * If we don't find inode in our list, then it was deleted,
941 * in which case, we don't need to record it's create tag.
942 */
943 if (!inode)
944 continue;
945 spin_unlock(&sbi->s_fc_lock);
946
947 /*
948 * We first write the inode and then the create dirent. This
949 * allows the recovery code to create an unnamed inode first
950 * and then link it to a directory entry. This allows us
951 * to use namei.c routines almost as is and simplifies
952 * the recovery code.
953 */
954 ret = ext4_fc_write_inode(inode, crc);
955 if (ret)
956 goto lock_and_exit;
957
958 ret = ext4_fc_write_inode_data(inode, crc);
959 if (ret)
960 goto lock_and_exit;
961
962 if (!ext4_fc_add_dentry_tlv(
963 sb, fc_dentry->fcd_op,
964 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
965 fc_dentry->fcd_name.len,
966 fc_dentry->fcd_name.name, crc)) {
967 spin_lock(&sbi->s_fc_lock);
968 ret = -ENOSPC;
969 goto lock_and_exit;
970 }
971
972 spin_lock(&sbi->s_fc_lock);
973 }
974 return 0;
975lock_and_exit:
976 spin_lock(&sbi->s_fc_lock);
977 return ret;
978}
979
980static int ext4_fc_perform_commit(journal_t *journal)
981{
982 struct super_block *sb = (struct super_block *)(journal->j_private);
983 struct ext4_sb_info *sbi = EXT4_SB(sb);
984 struct ext4_inode_info *iter;
985 struct ext4_fc_head head;
986 struct list_head *pos;
987 struct inode *inode;
988 struct blk_plug plug;
989 int ret = 0;
990 u32 crc = 0;
991
992 ret = ext4_fc_submit_inode_data_all(journal);
993 if (ret)
994 return ret;
995
996 ret = ext4_fc_wait_inode_data_all(journal);
997 if (ret)
998 return ret;
999
1000 blk_start_plug(&plug);
1001 if (sbi->s_fc_bytes == 0) {
1002 /*
1003 * Add a head tag only if this is the first fast commit
1004 * in this TID.
1005 */
1006 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1007 head.fc_tid = cpu_to_le32(
1008 sbi->s_journal->j_running_transaction->t_tid);
1009 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1010 (u8 *)&head, &crc))
1011 goto out;
1012 }
1013
1014 spin_lock(&sbi->s_fc_lock);
1015 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1016 if (ret) {
1017 spin_unlock(&sbi->s_fc_lock);
1018 goto out;
1019 }
1020
1021 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1022 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1023 inode = &iter->vfs_inode;
1024 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1025 continue;
1026
1027 spin_unlock(&sbi->s_fc_lock);
1028 ret = ext4_fc_write_inode_data(inode, &crc);
1029 if (ret)
1030 goto out;
1031 ret = ext4_fc_write_inode(inode, &crc);
1032 if (ret)
1033 goto out;
1034 spin_lock(&sbi->s_fc_lock);
1035 EXT4_I(inode)->i_fc_committed_subtid =
1036 atomic_read(&sbi->s_fc_subtid);
1037 }
1038 spin_unlock(&sbi->s_fc_lock);
1039
1040 ret = ext4_fc_write_tail(sb, crc);
1041
1042out:
1043 blk_finish_plug(&plug);
1044 return ret;
1045}
1046
1047/*
1048 * The main commit entry point. Performs a fast commit for transaction
1049 * commit_tid if needed. If it's not possible to perform a fast commit
1050 * due to various reasons, we fall back to full commit. Returns 0
1051 * on success, error otherwise.
1052 */
1053int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1054{
1055 struct super_block *sb = (struct super_block *)(journal->j_private);
1056 struct ext4_sb_info *sbi = EXT4_SB(sb);
1057 int nblks = 0, ret, bsize = journal->j_blocksize;
1058 int subtid = atomic_read(&sbi->s_fc_subtid);
1059 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1060 ktime_t start_time, commit_time;
1061
1062 trace_ext4_fc_commit_start(sb);
1063
1064 start_time = ktime_get();
1065
1066 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1067 (ext4_fc_is_ineligible(sb))) {
1068 reason = EXT4_FC_REASON_INELIGIBLE;
1069 goto out;
1070 }
1071
1072restart_fc:
1073 ret = jbd2_fc_begin_commit(journal, commit_tid);
1074 if (ret == -EALREADY) {
1075 /* There was an ongoing commit, check if we need to restart */
1076 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1077 commit_tid > journal->j_commit_sequence)
1078 goto restart_fc;
1079 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1080 goto out;
1081 } else if (ret) {
1082 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1083 reason = EXT4_FC_REASON_FC_START_FAILED;
1084 goto out;
1085 }
1086
1087 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1088 ret = ext4_fc_perform_commit(journal);
1089 if (ret < 0) {
1090 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1091 reason = EXT4_FC_REASON_FC_FAILED;
1092 goto out;
1093 }
1094 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1095 ret = jbd2_fc_wait_bufs(journal, nblks);
1096 if (ret < 0) {
1097 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1098 reason = EXT4_FC_REASON_FC_FAILED;
1099 goto out;
1100 }
1101 atomic_inc(&sbi->s_fc_subtid);
1102 jbd2_fc_end_commit(journal);
1103out:
1104 /* Has any ineligible update happened since we started? */
1105 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1106 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1107 reason = EXT4_FC_REASON_INELIGIBLE;
1108 }
1109
1110 spin_lock(&sbi->s_fc_lock);
1111 if (reason != EXT4_FC_REASON_OK &&
1112 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1113 sbi->s_fc_stats.fc_ineligible_commits++;
1114 } else {
1115 sbi->s_fc_stats.fc_num_commits++;
1116 sbi->s_fc_stats.fc_numblks += nblks;
1117 }
1118 spin_unlock(&sbi->s_fc_lock);
1119 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1120 trace_ext4_fc_commit_stop(sb, nblks, reason);
1121 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1122 /*
1123 * weight the commit time higher than the average time so we don't
1124 * react too strongly to vast changes in the commit time
1125 */
1126 if (likely(sbi->s_fc_avg_commit_time))
1127 sbi->s_fc_avg_commit_time = (commit_time +
1128 sbi->s_fc_avg_commit_time * 3) / 4;
1129 else
1130 sbi->s_fc_avg_commit_time = commit_time;
1131 jbd_debug(1,
1132 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1133 nblks, reason, subtid);
1134 if (reason == EXT4_FC_REASON_FC_FAILED)
1135 return jbd2_fc_end_commit_fallback(journal, commit_tid);
1136 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1137 reason == EXT4_FC_REASON_INELIGIBLE)
1138 return jbd2_complete_transaction(journal, commit_tid);
1139 return 0;
1140}
1141
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07001142/*
1143 * Fast commit cleanup routine. This is called after every fast commit and
1144 * full commit. full is true if we are called after a full commit.
1145 */
1146static void ext4_fc_cleanup(journal_t *journal, int full)
1147{
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07001148 struct super_block *sb = journal->j_private;
1149 struct ext4_sb_info *sbi = EXT4_SB(sb);
1150 struct ext4_inode_info *iter;
1151 struct ext4_fc_dentry_update *fc_dentry;
1152 struct list_head *pos, *n;
1153
1154 if (full && sbi->s_fc_bh)
1155 sbi->s_fc_bh = NULL;
1156
1157 jbd2_fc_release_bufs(journal);
1158
1159 spin_lock(&sbi->s_fc_lock);
1160 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1161 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1162 list_del_init(&iter->i_fc_list);
1163 ext4_clear_inode_state(&iter->vfs_inode,
1164 EXT4_STATE_FC_COMMITTING);
1165 ext4_fc_reset_inode(&iter->vfs_inode);
1166 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1167 smp_mb();
1168#if (BITS_PER_LONG < 64)
1169 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1170#else
1171 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1172#endif
1173 }
1174
1175 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1176 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1177 struct ext4_fc_dentry_update,
1178 fcd_list);
1179 list_del_init(&fc_dentry->fcd_list);
1180 spin_unlock(&sbi->s_fc_lock);
1181
1182 if (fc_dentry->fcd_name.name &&
1183 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1184 kfree(fc_dentry->fcd_name.name);
1185 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1186 spin_lock(&sbi->s_fc_lock);
1187 }
1188
1189 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1190 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1191 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1192 &sbi->s_fc_q[FC_Q_STAGING]);
1193
1194 sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
1195 sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
1196
1197 if (full)
1198 sbi->s_fc_bytes = 0;
1199 spin_unlock(&sbi->s_fc_lock);
1200 trace_ext4_fc_stats(sb);
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07001201}
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07001202
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07001203/* Ext4 Replay Path Routines */
1204
1205/* Get length of a particular tlv */
1206static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1207{
1208 return le16_to_cpu(tl->fc_len);
1209}
1210
1211/* Get a pointer to "value" of a tlv */
1212static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1213{
1214 return (u8 *)tl + sizeof(*tl);
1215}
1216
/*
 * Decoded form of a dentry tlv (link / unlink / create), filled in by
 * tl_to_darg() and handed to the dentry replay routines.
 */
struct dentry_info_args {
	int parent_ino;	/* inode number of the parent directory */
	int dname_len;	/* length of dname in bytes */
	int ino;	/* inode number the dentry refers to */
	int inode_len;	/* not set by tl_to_darg() */
	char *dname;	/* entry name, points into the tlv payload */
};
1222
1223static inline void tl_to_darg(struct dentry_info_args *darg,
1224 struct ext4_fc_tl *tl)
1225{
1226 struct ext4_fc_dentry_info *fcd;
1227
1228 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1229
1230 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1231 darg->ino = le32_to_cpu(fcd->fc_ino);
1232 darg->dname = fcd->fc_dname;
1233 darg->dname_len = ext4_fc_tag_len(tl) -
1234 sizeof(struct ext4_fc_dentry_info);
1235}
1236
1237/* Unlink replay function */
1238static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1239{
1240 struct inode *inode, *old_parent;
1241 struct qstr entry;
1242 struct dentry_info_args darg;
1243 int ret = 0;
1244
1245 tl_to_darg(&darg, tl);
1246
1247 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1248 darg.parent_ino, darg.dname_len);
1249
1250 entry.name = darg.dname;
1251 entry.len = darg.dname_len;
1252 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1253
1254 if (IS_ERR_OR_NULL(inode)) {
1255 jbd_debug(1, "Inode %d not found", darg.ino);
1256 return 0;
1257 }
1258
1259 old_parent = ext4_iget(sb, darg.parent_ino,
1260 EXT4_IGET_NORMAL);
1261 if (IS_ERR_OR_NULL(old_parent)) {
1262 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1263 iput(inode);
1264 return 0;
1265 }
1266
1267 ret = __ext4_unlink(old_parent, &entry, inode);
1268 /* -ENOENT ok coz it might not exist anymore. */
1269 if (ret == -ENOENT)
1270 ret = 0;
1271 iput(old_parent);
1272 iput(inode);
1273 return ret;
1274}
1275
1276static int ext4_fc_replay_link_internal(struct super_block *sb,
1277 struct dentry_info_args *darg,
1278 struct inode *inode)
1279{
1280 struct inode *dir = NULL;
1281 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1282 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1283 int ret = 0;
1284
1285 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1286 if (IS_ERR(dir)) {
1287 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1288 dir = NULL;
1289 goto out;
1290 }
1291
1292 dentry_dir = d_obtain_alias(dir);
1293 if (IS_ERR(dentry_dir)) {
1294 jbd_debug(1, "Failed to obtain dentry");
1295 dentry_dir = NULL;
1296 goto out;
1297 }
1298
1299 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1300 if (!dentry_inode) {
1301 jbd_debug(1, "Inode dentry not created.");
1302 ret = -ENOMEM;
1303 goto out;
1304 }
1305
1306 ret = __ext4_link(dir, inode, dentry_inode);
1307 /*
1308 * It's possible that link already existed since data blocks
1309 * for the dir in question got persisted before we crashed OR
1310 * we replayed this tag and crashed before the entire replay
1311 * could complete.
1312 */
1313 if (ret && ret != -EEXIST) {
1314 jbd_debug(1, "Failed to link\n");
1315 goto out;
1316 }
1317
1318 ret = 0;
1319out:
1320 if (dentry_dir) {
1321 d_drop(dentry_dir);
1322 dput(dentry_dir);
1323 } else if (dir) {
1324 iput(dir);
1325 }
1326 if (dentry_inode) {
1327 d_drop(dentry_inode);
1328 dput(dentry_inode);
1329 }
1330
1331 return ret;
1332}
1333
1334/* Link replay function */
1335static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1336{
1337 struct inode *inode;
1338 struct dentry_info_args darg;
1339 int ret = 0;
1340
1341 tl_to_darg(&darg, tl);
1342 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1343 darg.parent_ino, darg.dname_len);
1344
1345 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1346 if (IS_ERR_OR_NULL(inode)) {
1347 jbd_debug(1, "Inode not found.");
1348 return 0;
1349 }
1350
1351 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1352 iput(inode);
1353 return ret;
1354}
1355
1356/*
1357 * Record all the modified inodes during replay. We use this later to setup
1358 * block bitmaps correctly.
1359 */
1360static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1361{
1362 struct ext4_fc_replay_state *state;
1363 int i;
1364
1365 state = &EXT4_SB(sb)->s_fc_replay_state;
1366 for (i = 0; i < state->fc_modified_inodes_used; i++)
1367 if (state->fc_modified_inodes[i] == ino)
1368 return 0;
1369 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1370 state->fc_modified_inodes_size +=
1371 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1372 state->fc_modified_inodes = krealloc(
1373 state->fc_modified_inodes, sizeof(int) *
1374 state->fc_modified_inodes_size,
1375 GFP_KERNEL);
1376 if (!state->fc_modified_inodes)
1377 return -ENOMEM;
1378 }
1379 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1380 return 0;
1381}
1382
1383/*
1384 * Inode replay function
1385 */
1386static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1387{
1388 struct ext4_fc_inode *fc_inode;
1389 struct ext4_inode *raw_inode;
1390 struct ext4_inode *raw_fc_inode;
1391 struct inode *inode = NULL;
1392 struct ext4_iloc iloc;
1393 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1394 struct ext4_extent_header *eh;
1395
1396 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1397
1398 ino = le32_to_cpu(fc_inode->fc_ino);
1399 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1400
1401 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1402 if (!IS_ERR_OR_NULL(inode)) {
1403 ext4_ext_clear_bb(inode);
1404 iput(inode);
1405 }
1406
1407 ext4_fc_record_modified_inode(sb, ino);
1408
1409 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1410 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1411 if (ret)
1412 goto out;
1413
1414 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1415 raw_inode = ext4_raw_inode(&iloc);
1416
1417 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1418 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1419 inode_len - offsetof(struct ext4_inode, i_generation));
1420 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1421 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1422 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1423 memset(eh, 0, sizeof(*eh));
1424 eh->eh_magic = EXT4_EXT_MAGIC;
1425 eh->eh_max = cpu_to_le16(
1426 (sizeof(raw_inode->i_block) -
1427 sizeof(struct ext4_extent_header))
1428 / sizeof(struct ext4_extent));
1429 }
1430 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1431 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1432 sizeof(raw_inode->i_block));
1433 }
1434
1435 /* Immediately update the inode on disk. */
1436 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1437 if (ret)
1438 goto out;
1439 ret = sync_dirty_buffer(iloc.bh);
1440 if (ret)
1441 goto out;
1442 ret = ext4_mark_inode_used(sb, ino);
1443 if (ret)
1444 goto out;
1445
1446 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1447 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1448 if (IS_ERR_OR_NULL(inode)) {
1449 jbd_debug(1, "Inode not found.");
1450 return -EFSCORRUPTED;
1451 }
1452
1453 /*
1454 * Our allocator could have made different decisions than before
1455 * crashing. This should be fixed but until then, we calculate
1456 * the number of blocks the inode.
1457 */
1458 ext4_ext_replay_set_iblocks(inode);
1459
1460 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1461 ext4_reset_inode_seed(inode);
1462
1463 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1464 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1465 sync_dirty_buffer(iloc.bh);
1466 brelse(iloc.bh);
1467out:
1468 iput(inode);
1469 if (!ret)
1470 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1471
1472 return 0;
1473}
1474
1475/*
1476 * Dentry create replay function.
1477 *
1478 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1479 * inode for which we are trying to create a dentry here, should already have
1480 * been replayed before we start here.
1481 */
1482static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1483{
1484 int ret = 0;
1485 struct inode *inode = NULL;
1486 struct inode *dir = NULL;
1487 struct dentry_info_args darg;
1488
1489 tl_to_darg(&darg, tl);
1490
1491 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1492 darg.parent_ino, darg.dname_len);
1493
1494 /* This takes care of update group descriptor and other metadata */
1495 ret = ext4_mark_inode_used(sb, darg.ino);
1496 if (ret)
1497 goto out;
1498
1499 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1500 if (IS_ERR_OR_NULL(inode)) {
1501 jbd_debug(1, "inode %d not found.", darg.ino);
1502 inode = NULL;
1503 ret = -EINVAL;
1504 goto out;
1505 }
1506
1507 if (S_ISDIR(inode->i_mode)) {
1508 /*
1509 * If we are creating a directory, we need to make sure that the
1510 * dot and dot dot dirents are setup properly.
1511 */
1512 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1513 if (IS_ERR_OR_NULL(dir)) {
1514 jbd_debug(1, "Dir %d not found.", darg.ino);
1515 goto out;
1516 }
1517 ret = ext4_init_new_dir(NULL, dir, inode);
1518 iput(dir);
1519 if (ret) {
1520 ret = 0;
1521 goto out;
1522 }
1523 }
1524 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1525 if (ret)
1526 goto out;
1527 set_nlink(inode, 1);
1528 ext4_mark_inode_dirty(NULL, inode);
1529out:
1530 if (inode)
1531 iput(inode);
1532 return ret;
1533}
1534
1535/*
1536 * Record physical disk regions which are in use as per fast commit area. Our
1537 * simple replay phase allocator excludes these regions from allocation.
1538 */
1539static int ext4_fc_record_regions(struct super_block *sb, int ino,
1540 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1541{
1542 struct ext4_fc_replay_state *state;
1543 struct ext4_fc_alloc_region *region;
1544
1545 state = &EXT4_SB(sb)->s_fc_replay_state;
1546 if (state->fc_regions_used == state->fc_regions_size) {
1547 state->fc_regions_size +=
1548 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1549 state->fc_regions = krealloc(
1550 state->fc_regions,
1551 state->fc_regions_size *
1552 sizeof(struct ext4_fc_alloc_region),
1553 GFP_KERNEL);
1554 if (!state->fc_regions)
1555 return -ENOMEM;
1556 }
1557 region = &state->fc_regions[state->fc_regions_used++];
1558 region->ino = ino;
1559 region->lblk = lblk;
1560 region->pblk = pblk;
1561 region->len = len;
1562
1563 return 0;
1564}
1565
1566/* Replay add range tag */
1567static int ext4_fc_replay_add_range(struct super_block *sb,
1568 struct ext4_fc_tl *tl)
1569{
1570 struct ext4_fc_add_range *fc_add_ex;
1571 struct ext4_extent newex, *ex;
1572 struct inode *inode;
1573 ext4_lblk_t start, cur;
1574 int remaining, len;
1575 ext4_fsblk_t start_pblk;
1576 struct ext4_map_blocks map;
1577 struct ext4_ext_path *path = NULL;
1578 int ret;
1579
1580 fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1581 ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1582
1583 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1584 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1585 ext4_ext_get_actual_len(ex));
1586
1587 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1588 EXT4_IGET_NORMAL);
1589 if (IS_ERR_OR_NULL(inode)) {
1590 jbd_debug(1, "Inode not found.");
1591 return 0;
1592 }
1593
1594 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1595
1596 start = le32_to_cpu(ex->ee_block);
1597 start_pblk = ext4_ext_pblock(ex);
1598 len = ext4_ext_get_actual_len(ex);
1599
1600 cur = start;
1601 remaining = len;
1602 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1603 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1604 inode->i_ino);
1605
1606 while (remaining > 0) {
1607 map.m_lblk = cur;
1608 map.m_len = remaining;
1609 map.m_pblk = 0;
1610 ret = ext4_map_blocks(NULL, inode, &map, 0);
1611
1612 if (ret < 0) {
1613 iput(inode);
1614 return 0;
1615 }
1616
1617 if (ret == 0) {
1618 /* Range is not mapped */
1619 path = ext4_find_extent(inode, cur, NULL, 0);
1620 if (!path)
1621 continue;
1622 memset(&newex, 0, sizeof(newex));
1623 newex.ee_block = cpu_to_le32(cur);
1624 ext4_ext_store_pblock(
1625 &newex, start_pblk + cur - start);
1626 newex.ee_len = cpu_to_le16(map.m_len);
1627 if (ext4_ext_is_unwritten(ex))
1628 ext4_ext_mark_unwritten(&newex);
1629 down_write(&EXT4_I(inode)->i_data_sem);
1630 ret = ext4_ext_insert_extent(
1631 NULL, inode, &path, &newex, 0);
1632 up_write((&EXT4_I(inode)->i_data_sem));
1633 ext4_ext_drop_refs(path);
1634 kfree(path);
1635 if (ret) {
1636 iput(inode);
1637 return 0;
1638 }
1639 goto next;
1640 }
1641
1642 if (start_pblk + cur - start != map.m_pblk) {
1643 /*
1644 * Logical to physical mapping changed. This can happen
1645 * if this range was removed and then reallocated to
1646 * map to new physical blocks during a fast commit.
1647 */
1648 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1649 ext4_ext_is_unwritten(ex),
1650 start_pblk + cur - start);
1651 if (ret) {
1652 iput(inode);
1653 return 0;
1654 }
1655 /*
1656 * Mark the old blocks as free since they aren't used
1657 * anymore. We maintain an array of all the modified
1658 * inodes. In case these blocks are still used at either
1659 * a different logical range in the same inode or in
1660 * some different inode, we will mark them as allocated
1661 * at the end of the FC replay using our array of
1662 * modified inodes.
1663 */
1664 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1665 goto next;
1666 }
1667
1668 /* Range is mapped and needs a state change */
1669 jbd_debug(1, "Converting from %d to %d %lld",
1670 map.m_flags & EXT4_MAP_UNWRITTEN,
1671 ext4_ext_is_unwritten(ex), map.m_pblk);
1672 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1673 ext4_ext_is_unwritten(ex), map.m_pblk);
1674 if (ret) {
1675 iput(inode);
1676 return 0;
1677 }
1678 /*
1679 * We may have split the extent tree while toggling the state.
1680 * Try to shrink the extent tree now.
1681 */
1682 ext4_ext_replay_shrink_inode(inode, start + len);
1683next:
1684 cur += map.m_len;
1685 remaining -= map.m_len;
1686 }
1687 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1688 sb->s_blocksize_bits);
1689 iput(inode);
1690 return 0;
1691}
1692
1693/* Replay DEL_RANGE tag */
1694static int
1695ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1696{
1697 struct inode *inode;
1698 struct ext4_fc_del_range *lrange;
1699 struct ext4_map_blocks map;
1700 ext4_lblk_t cur, remaining;
1701 int ret;
1702
1703 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1704 cur = le32_to_cpu(lrange->fc_lblk);
1705 remaining = le32_to_cpu(lrange->fc_len);
1706
1707 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1708 le32_to_cpu(lrange->fc_ino), cur, remaining);
1709
1710 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1711 if (IS_ERR_OR_NULL(inode)) {
1712 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1713 return 0;
1714 }
1715
1716 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1717
1718 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1719 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1720 le32_to_cpu(lrange->fc_len));
1721 while (remaining > 0) {
1722 map.m_lblk = cur;
1723 map.m_len = remaining;
1724
1725 ret = ext4_map_blocks(NULL, inode, &map, 0);
1726 if (ret < 0) {
1727 iput(inode);
1728 return 0;
1729 }
1730 if (ret > 0) {
1731 remaining -= ret;
1732 cur += ret;
1733 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1734 } else {
1735 remaining -= map.m_len;
1736 cur += map.m_len;
1737 }
1738 }
1739
1740 ret = ext4_punch_hole(inode,
1741 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1742 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1743 if (ret)
1744 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1745 ext4_ext_replay_shrink_inode(inode,
1746 i_size_read(inode) >> sb->s_blocksize_bits);
1747 ext4_mark_inode_dirty(NULL, inode);
1748 iput(inode);
1749
1750 return 0;
1751}
1752
1753static inline const char *tag2str(u16 tag)
1754{
1755 switch (tag) {
1756 case EXT4_FC_TAG_LINK:
1757 return "TAG_ADD_ENTRY";
1758 case EXT4_FC_TAG_UNLINK:
1759 return "TAG_DEL_ENTRY";
1760 case EXT4_FC_TAG_ADD_RANGE:
1761 return "TAG_ADD_RANGE";
1762 case EXT4_FC_TAG_CREAT:
1763 return "TAG_CREAT_DENTRY";
1764 case EXT4_FC_TAG_DEL_RANGE:
1765 return "TAG_DEL_RANGE";
1766 case EXT4_FC_TAG_INODE:
1767 return "TAG_INODE";
1768 case EXT4_FC_TAG_PAD:
1769 return "TAG_PAD";
1770 case EXT4_FC_TAG_TAIL:
1771 return "TAG_TAIL";
1772 case EXT4_FC_TAG_HEAD:
1773 return "TAG_HEAD";
1774 default:
1775 return "TAG_ERROR";
1776 }
1777}
1778
1779static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1780{
1781 struct ext4_fc_replay_state *state;
1782 struct inode *inode;
1783 struct ext4_ext_path *path = NULL;
1784 struct ext4_map_blocks map;
1785 int i, ret, j;
1786 ext4_lblk_t cur, end;
1787
1788 state = &EXT4_SB(sb)->s_fc_replay_state;
1789 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1790 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1791 EXT4_IGET_NORMAL);
1792 if (IS_ERR_OR_NULL(inode)) {
1793 jbd_debug(1, "Inode %d not found.",
1794 state->fc_modified_inodes[i]);
1795 continue;
1796 }
1797 cur = 0;
1798 end = EXT_MAX_BLOCKS;
1799 while (cur < end) {
1800 map.m_lblk = cur;
1801 map.m_len = end - cur;
1802
1803 ret = ext4_map_blocks(NULL, inode, &map, 0);
1804 if (ret < 0)
1805 break;
1806
1807 if (ret > 0) {
1808 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1809 if (!IS_ERR_OR_NULL(path)) {
1810 for (j = 0; j < path->p_depth; j++)
1811 ext4_mb_mark_bb(inode->i_sb,
1812 path[j].p_block, 1, 1);
1813 ext4_ext_drop_refs(path);
1814 kfree(path);
1815 }
1816 cur += ret;
1817 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1818 map.m_len, 1);
1819 } else {
1820 cur = cur + (map.m_len ? map.m_len : 1);
1821 }
1822 }
1823 iput(inode);
1824 }
1825}
1826
1827/*
1828 * Check if block is in excluded regions for block allocation. The simple
1829 * allocator that runs during replay phase is calls this function to see
1830 * if it is okay to use a block.
1831 */
1832bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1833{
1834 int i;
1835 struct ext4_fc_replay_state *state;
1836
1837 state = &EXT4_SB(sb)->s_fc_replay_state;
1838 for (i = 0; i < state->fc_regions_valid; i++) {
1839 if (state->fc_regions[i].ino == 0 ||
1840 state->fc_regions[i].len == 0)
1841 continue;
1842 if (blk >= state->fc_regions[i].pblk &&
1843 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1844 return true;
1845 }
1846 return false;
1847}
1848
1849/* Cleanup function called after replay */
1850void ext4_fc_replay_cleanup(struct super_block *sb)
1851{
1852 struct ext4_sb_info *sbi = EXT4_SB(sb);
1853
1854 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1855 kfree(sbi->s_fc_replay_state.fc_regions);
1856 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1857}
1858
1859/*
1860 * Recovery Scan phase handler
1861 *
1862 * This function is called during the scan phase and is responsible
1863 * for doing following things:
1864 * - Make sure the fast commit area has valid tags for replay
1865 * - Count number of tags that need to be replayed by the replay handler
1866 * - Verify CRC
1867 * - Create a list of excluded blocks for allocation during replay phase
1868 *
1869 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1870 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1871 * to indicate that scan has finished and JBD2 can now start replay phase.
1872 * It returns a negative error to indicate that there was an error. At the end
1873 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1874 * to indicate the number of tags that need to replayed during the replay phase.
1875 */
1876static int ext4_fc_replay_scan(journal_t *journal,
1877 struct buffer_head *bh, int off,
1878 tid_t expected_tid)
1879{
1880 struct super_block *sb = journal->j_private;
1881 struct ext4_sb_info *sbi = EXT4_SB(sb);
1882 struct ext4_fc_replay_state *state;
1883 int ret = JBD2_FC_REPLAY_CONTINUE;
1884 struct ext4_fc_add_range *ext;
1885 struct ext4_fc_tl *tl;
1886 struct ext4_fc_tail *tail;
1887 __u8 *start, *end;
1888 struct ext4_fc_head *head;
1889 struct ext4_extent *ex;
1890
1891 state = &sbi->s_fc_replay_state;
1892
1893 start = (u8 *)bh->b_data;
1894 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1895
1896 if (state->fc_replay_expected_off == 0) {
1897 state->fc_cur_tag = 0;
1898 state->fc_replay_num_tags = 0;
1899 state->fc_crc = 0;
1900 state->fc_regions = NULL;
1901 state->fc_regions_valid = state->fc_regions_used =
1902 state->fc_regions_size = 0;
1903 /* Check if we can stop early */
1904 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1905 != EXT4_FC_TAG_HEAD)
1906 return 0;
1907 }
1908
1909 if (off != state->fc_replay_expected_off) {
1910 ret = -EFSCORRUPTED;
1911 goto out_err;
1912 }
1913
1914 state->fc_replay_expected_off++;
1915 fc_for_each_tl(start, end, tl) {
1916 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1917 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1918 switch (le16_to_cpu(tl->fc_tag)) {
1919 case EXT4_FC_TAG_ADD_RANGE:
1920 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1921 ex = (struct ext4_extent *)&ext->fc_ex;
1922 ret = ext4_fc_record_regions(sb,
1923 le32_to_cpu(ext->fc_ino),
1924 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1925 ext4_ext_get_actual_len(ex));
1926 if (ret < 0)
1927 break;
1928 ret = JBD2_FC_REPLAY_CONTINUE;
1929 fallthrough;
1930 case EXT4_FC_TAG_DEL_RANGE:
1931 case EXT4_FC_TAG_LINK:
1932 case EXT4_FC_TAG_UNLINK:
1933 case EXT4_FC_TAG_CREAT:
1934 case EXT4_FC_TAG_INODE:
1935 case EXT4_FC_TAG_PAD:
1936 state->fc_cur_tag++;
1937 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1938 sizeof(*tl) + ext4_fc_tag_len(tl));
1939 break;
1940 case EXT4_FC_TAG_TAIL:
1941 state->fc_cur_tag++;
1942 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1943 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1944 sizeof(*tl) +
1945 offsetof(struct ext4_fc_tail,
1946 fc_crc));
1947 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1948 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1949 state->fc_replay_num_tags = state->fc_cur_tag;
1950 state->fc_regions_valid =
1951 state->fc_regions_used;
1952 } else {
1953 ret = state->fc_replay_num_tags ?
1954 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1955 }
1956 state->fc_crc = 0;
1957 break;
1958 case EXT4_FC_TAG_HEAD:
1959 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1960 if (le32_to_cpu(head->fc_features) &
1961 ~EXT4_FC_SUPPORTED_FEATURES) {
1962 ret = -EOPNOTSUPP;
1963 break;
1964 }
1965 if (le32_to_cpu(head->fc_tid) != expected_tid) {
1966 ret = JBD2_FC_REPLAY_STOP;
1967 break;
1968 }
1969 state->fc_cur_tag++;
1970 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1971 sizeof(*tl) + ext4_fc_tag_len(tl));
1972 break;
1973 default:
1974 ret = state->fc_replay_num_tags ?
1975 JBD2_FC_REPLAY_STOP : -ECANCELED;
1976 }
1977 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1978 break;
1979 }
1980
1981out_err:
1982 trace_ext4_fc_replay_scan(sb, ret, off);
1983 return ret;
1984}
1985
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07001986/*
1987 * Main recovery path entry point.
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07001988 * The meaning of return codes is similar as above.
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07001989 */
1990static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
1991 enum passtype pass, int off, tid_t expected_tid)
1992{
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07001993 struct super_block *sb = journal->j_private;
1994 struct ext4_sb_info *sbi = EXT4_SB(sb);
1995 struct ext4_fc_tl *tl;
1996 __u8 *start, *end;
1997 int ret = JBD2_FC_REPLAY_CONTINUE;
1998 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
1999 struct ext4_fc_tail *tail;
2000
2001 if (pass == PASS_SCAN) {
2002 state->fc_current_pass = PASS_SCAN;
2003 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2004 }
2005
2006 if (state->fc_current_pass != pass) {
2007 state->fc_current_pass = pass;
2008 sbi->s_mount_state |= EXT4_FC_REPLAY;
2009 }
2010 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2011 jbd_debug(1, "Replay stops\n");
2012 ext4_fc_set_bitmaps_and_counters(sb);
2013 return 0;
2014 }
2015
2016#ifdef CONFIG_EXT4_DEBUG
2017 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2018 pr_warn("Dropping fc block %d because max_replay set\n", off);
2019 return JBD2_FC_REPLAY_STOP;
2020 }
2021#endif
2022
2023 start = (u8 *)bh->b_data;
2024 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2025
2026 fc_for_each_tl(start, end, tl) {
2027 if (state->fc_replay_num_tags == 0) {
2028 ret = JBD2_FC_REPLAY_STOP;
2029 ext4_fc_set_bitmaps_and_counters(sb);
2030 break;
2031 }
2032 jbd_debug(3, "Replay phase, tag:%s\n",
2033 tag2str(le16_to_cpu(tl->fc_tag)));
2034 state->fc_replay_num_tags--;
2035 switch (le16_to_cpu(tl->fc_tag)) {
2036 case EXT4_FC_TAG_LINK:
2037 ret = ext4_fc_replay_link(sb, tl);
2038 break;
2039 case EXT4_FC_TAG_UNLINK:
2040 ret = ext4_fc_replay_unlink(sb, tl);
2041 break;
2042 case EXT4_FC_TAG_ADD_RANGE:
2043 ret = ext4_fc_replay_add_range(sb, tl);
2044 break;
2045 case EXT4_FC_TAG_CREAT:
2046 ret = ext4_fc_replay_create(sb, tl);
2047 break;
2048 case EXT4_FC_TAG_DEL_RANGE:
2049 ret = ext4_fc_replay_del_range(sb, tl);
2050 break;
2051 case EXT4_FC_TAG_INODE:
2052 ret = ext4_fc_replay_inode(sb, tl);
2053 break;
2054 case EXT4_FC_TAG_PAD:
2055 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2056 ext4_fc_tag_len(tl), 0);
2057 break;
2058 case EXT4_FC_TAG_TAIL:
2059 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2060 ext4_fc_tag_len(tl), 0);
2061 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2062 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2063 break;
2064 case EXT4_FC_TAG_HEAD:
2065 break;
2066 default:
2067 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2068 ext4_fc_tag_len(tl), 0);
2069 ret = -ECANCELED;
2070 break;
2071 }
2072 if (ret < 0)
2073 break;
2074 ret = JBD2_FC_REPLAY_CONTINUE;
2075 }
2076 return ret;
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07002077}
2078
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002079void ext4_fc_init(struct super_block *sb, journal_t *journal)
2080{
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07002081 /*
2082 * We set replay callback even if fast commit disabled because we may
2083 * could still have fast commit blocks that need to be replayed even if
2084 * fast commit has now been turned off.
2085 */
2086 journal->j_fc_replay_callback = ext4_fc_replay;
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002087 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2088 return;
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07002089 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002090 if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) {
2091 pr_warn("Error while enabling fast commits, turning off.");
2092 ext4_clear_feature_fast_commit(sb);
2093 }
2094}
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07002095
/*
 * Human-readable descriptions of why a fast commit was ineligible.
 * ext4_fc_info_show() prints entry i next to
 * stats->fc_ineligible_reason_count[i], so the order of the strings must
 * match the order of the reason counters.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed",
};
2107
2108int ext4_fc_info_show(struct seq_file *seq, void *v)
2109{
2110 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2111 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2112 int i;
2113
2114 if (v != SEQ_START_TOKEN)
2115 return 0;
2116
2117 seq_printf(seq,
2118 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2119 stats->fc_num_commits, stats->fc_ineligible_commits,
2120 stats->fc_numblks,
2121 div_u64(sbi->s_fc_avg_commit_time, 1000));
2122 seq_puts(seq, "Ineligible reasons:\n");
2123 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2124 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2125 stats->fc_ineligible_reason_count[i]);
2126
2127 return 0;
2128}
2129
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07002130int __init ext4_fc_init_dentry_cache(void)
2131{
2132 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2133 SLAB_RECLAIM_ACCOUNT);
2134
2135 if (ext4_fc_dentry_cachep == NULL)
2136 return -ENOMEM;
2137
2138 return 0;
2139}