blob: e69c580fa91e3fd5df5af753478afbf2592decd5 [file] [log] [blame]
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07001// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -070010#include "ext4.h"
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -070011#include "ext4_jbd2.h"
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -070012#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * TODOs
107 * -----
108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109 * eligible update must be protected within ext4_fc_start_update() and
110 * ext4_fc_stop_update(). These routines are called at much higher
111 * routines. This can be made more fine grained by combining with
112 * ext4_journal_start().
113 *
114 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115 *
116 * 3) Handle more ineligible cases.
117 */
118
119#include <trace/events/ext4.h>
120static struct kmem_cache *ext4_fc_dentry_cachep;
121
122static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123{
124 BUFFER_TRACE(bh, "");
125 if (uptodate) {
126 ext4_debug("%s: Block %lld up-to-date",
127 __func__, bh->b_blocknr);
128 set_buffer_uptodate(bh);
129 } else {
130 ext4_debug("%s: Block %lld not up-to-date",
131 __func__, bh->b_blocknr);
132 clear_buffer_uptodate(bh);
133 }
134
135 unlock_buffer(bh);
136}
137
138static inline void ext4_fc_reset_inode(struct inode *inode)
139{
140 struct ext4_inode_info *ei = EXT4_I(inode);
141
142 ei->i_fc_lblk_start = 0;
143 ei->i_fc_lblk_len = 0;
144}
145
146void ext4_fc_init_inode(struct inode *inode)
147{
148 struct ext4_inode_info *ei = EXT4_I(inode);
149
150 ext4_fc_reset_inode(inode);
151 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152 INIT_LIST_HEAD(&ei->i_fc_list);
153 init_waitqueue_head(&ei->i_fc_wait);
154 atomic_set(&ei->i_fc_updates, 0);
155 ei->i_fc_committed_subtid = 0;
156}
157
158/*
 * Inform Ext4's fast commit subsystem about the start of an inode update
160 *
161 * This function is called by the high level call VFS callbacks before
162 * performing any inode update. This function blocks if there's an ongoing
163 * fast commit on the inode in question.
164 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to do if fast commits are disabled or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Inode not queued for fast commit: no commit can race with us. */
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		wait_queue_head_t *wq;
		/*
		 * The EXT4_STATE_FC_COMMITTING bit lives in i_state_flags on
		 * builds where BITS_PER_LONG < 64 and in i_flags otherwise;
		 * wait on whichever word actually holds the bit.
		 */
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_COMMITTING);
#endif
		/* Drop the spinlock before sleeping; retest after wakeup. */
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		schedule();
		finish_wait(wq, &wait.wq_entry);
		goto restart;
	}
out:
	/* Count this update; ext4_fc_stop_update() decrements it. */
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
201
202/*
203 * Stop inode update and wake up waiting fast commits if any.
204 */
205void ext4_fc_stop_update(struct inode *inode)
206{
207 struct ext4_inode_info *ei = EXT4_I(inode);
208
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700209 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
210 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700211 return;
212
213 if (atomic_dec_and_test(&ei->i_fc_updates))
214 wake_up_all(&ei->i_fc_wait);
215}
216
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to do if fast commits are disabled or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		wait_queue_head_t *wq;
		/*
		 * The COMMITTING bit lives in i_state_flags on builds with
		 * BITS_PER_LONG < 64 and in i_flags otherwise (same scheme
		 * as ext4_fc_start_update()).
		 */
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_state_flags,
				   EXT4_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_flags,
				   EXT4_STATE_FC_COMMITTING);
#endif
		/* Drop the lock while sleeping; the commit path needs it. */
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		schedule();
		finish_wait(wq, &wait.wq_entry);
		goto restart;
	}
	/* list_empty() was checked above under this lock; this is defensive. */
	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
259
260/*
261 * Mark file system as fast commit ineligible. This means that next commit
262 * operation would result in a full jbd2 commit.
263 */
264void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
265{
266 struct ext4_sb_info *sbi = EXT4_SB(sb);
267
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700268 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
269 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
270 return;
271
Harshad Shirwadkarababea72020-10-26 21:49:15 -0700272 sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700273 WARN_ON(reason >= EXT4_FC_REASON_MAX);
274 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
275}
276
277/*
278 * Start a fast commit ineligible update. Any commits that happen while
279 * such an operation is in progress fall back to full commits.
280 */
281void ext4_fc_start_ineligible(struct super_block *sb, int reason)
282{
283 struct ext4_sb_info *sbi = EXT4_SB(sb);
284
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700285 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
286 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
287 return;
288
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700289 WARN_ON(reason >= EXT4_FC_REASON_MAX);
290 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
291 atomic_inc(&sbi->s_fc_ineligible_updates);
292}
293
294/*
Harshad Shirwadkarababea72020-10-26 21:49:15 -0700295 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700296 * to ensure that after stopping the ineligible update, at least one full
297 * commit takes place.
298 */
299void ext4_fc_stop_ineligible(struct super_block *sb)
300{
Harshad Shirwadkar8016e292020-10-15 13:37:59 -0700301 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
302 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
303 return;
304
Harshad Shirwadkarababea72020-10-26 21:49:15 -0700305 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700306 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
307}
308
309static inline int ext4_fc_is_ineligible(struct super_block *sb)
310{
Harshad Shirwadkarababea72020-10-26 21:49:15 -0700311 return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700312 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
313}
314
315/*
316 * Generic fast commit tracking function. If this is the first time this we are
317 * called after a full commit, we initialize fast commit fields and then call
318 * __fc_track_fn() with update = 0. If we have already been called after a full
319 * commit, we pass update = 1. Based on that, the track function can determine
320 * if it needs to track a field for the first time or if it needs to just
321 * update the previously tracked value.
322 *
323 * If enqueue is set, this function enqueues the inode in fast commit list.
324 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	/* Fast commits disabled or replay in progress: nothing to track. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	/*
	 * If this handle belongs to the same transaction the inode was last
	 * tracked under, the track function only updates previously tracked
	 * state (update == true); otherwise tracking starts afresh.
	 */
	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	/* __fc_track_fn runs under i_fc_lock (it may drop and re-take it). */
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	/*
	 * Queue the inode for fast commit; while a commit is running, new
	 * inodes go on the staging queue instead of the main queue.
	 */
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
367
/* Arguments passed to __track_dentry_update() via ext4_fc_track_template(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
372
/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop i_fc_lock around the allocation; GFP_NOFS may sleep. */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Can't record this update: force the next commit to be full. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		/* Caller expects i_fc_lock held on return. */
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/* Short names fit in the embedded buffer; longer ones are kmalloc'ed. */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		/* Not NUL-terminated; fcd_name.len carries the length. */
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Dentry updates go to the staging queue while a commit is running. */
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	/* Re-take i_fc_lock for the caller (ext4_fc_track_template). */
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
423
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800424void __ext4_fc_track_unlink(handle_t *handle,
425 struct inode *inode, struct dentry *dentry)
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700426{
427 struct __track_dentry_update_args args;
428 int ret;
429
430 args.dentry = dentry;
431 args.op = EXT4_FC_TAG_UNLINK;
432
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800433 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700434 (void *)&args, 0);
435 trace_ext4_fc_track_unlink(inode, dentry, ret);
436}
437
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800438void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
439{
440 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
441}
442
443void __ext4_fc_track_link(handle_t *handle,
444 struct inode *inode, struct dentry *dentry)
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700445{
446 struct __track_dentry_update_args args;
447 int ret;
448
449 args.dentry = dentry;
450 args.op = EXT4_FC_TAG_LINK;
451
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800452 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700453 (void *)&args, 0);
454 trace_ext4_fc_track_link(inode, dentry, ret);
455}
456
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800457void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
458{
459 __ext4_fc_track_link(handle, d_inode(dentry), dentry);
460}
461
462void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700463{
464 struct __track_dentry_update_args args;
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800465 struct inode *inode = d_inode(dentry);
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700466 int ret;
467
468 args.dentry = dentry;
469 args.op = EXT4_FC_TAG_CREAT;
470
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800471 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700472 (void *)&args, 0);
473 trace_ext4_fc_track_create(inode, dentry, ret);
474}
475
476/* __track_fn for inode tracking */
477static int __track_inode(struct inode *inode, void *arg, bool update)
478{
479 if (update)
480 return -EEXIST;
481
482 EXT4_I(inode)->i_fc_lblk_len = 0;
483
484 return 0;
485}
486
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800487void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700488{
489 int ret;
490
491 if (S_ISDIR(inode->i_mode))
492 return;
493
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800494 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700495 trace_ext4_fc_track_inode(inode, ret);
496}
497
/* Inclusive logical block range [start, end] handed to __track_range(). */
struct __track_range_args {
	ext4_lblk_t start, end;
};
501
502/* __track_fn for tracking data updates */
503static int __track_range(struct inode *inode, void *arg, bool update)
504{
505 struct ext4_inode_info *ei = EXT4_I(inode);
506 ext4_lblk_t oldstart;
507 struct __track_range_args *__arg =
508 (struct __track_range_args *)arg;
509
510 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
511 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
512 return -ECANCELED;
513 }
514
515 oldstart = ei->i_fc_lblk_start;
516
517 if (update && ei->i_fc_lblk_len > 0) {
518 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
519 ei->i_fc_lblk_len =
520 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
521 ei->i_fc_lblk_start + 1;
522 } else {
523 ei->i_fc_lblk_start = __arg->start;
524 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
525 }
526
527 return 0;
528}
529
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800530void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700531 ext4_lblk_t end)
532{
533 struct __track_range_args args;
534 int ret;
535
536 if (S_ISDIR(inode->i_mode))
537 return;
538
539 args.start = start;
540 args.end = end;
541
Harshad Shirwadkara80f7fc2020-11-05 19:58:53 -0800542 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -0700543
544 trace_ext4_fc_track_range(inode, start, end, ret);
545}
546
/*
 * Submit the current fast commit buffer for write-out.  The write is
 * synchronous (REQ_SYNC) and, when barriers are enabled, ordered with a
 * preflush and FUA.
 */
static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	/* ext4_end_buffer_io_sync() records the result and unlocks bh. */
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	/* A fresh buffer is obtained from jbd2 for the next reservation. */
	EXT4_SB(sb)->s_fc_bh = NULL;
}
561
562/* Ext4 commit path routines */
563
564/* memzero and update CRC */
565static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
566 u32 *crc)
567{
568 void *ret;
569
570 ret = memset(dst, 0, len);
571 if (crc)
572 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
573 return ret;
574}
575
576/*
577 * Allocate len bytes on a fast commit buffer.
578 *
579 * During the commit time this function is used to manage fast commit
580 * block space. We don't split a fast commit log onto different
581 * blocks. So this function makes sure that if there's not enough space
582 * on the current block, the remaining space in the current block is
583 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
584 * new block is from jbd2 and CRC is updated to reflect the padding
585 * we added.
586 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	/* Fold the PAD header and zeroed filler into the running CRC. */
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded block and continue on a fresh jbd2 block. */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account the skipped tail of the old block plus this allocation. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
635
636/* memcpy to fc reserved space and update CRC */
637static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
638 int len, u32 *crc)
639{
640 if (crc)
641 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
642 return memcpy(dst, src, len);
643}
644
645/*
646 * Complete a fast commit by writing tail tag.
647 *
648 * Writing tail tag marks the end of a fast commit. In order to guarantee
649 * atomicity, after writing tail tag, even if there's space remaining
650 * in the block, next commit shouldn't use it. That's why tail tag
651 * has the length as that of the remaining space on the block.
652 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's no enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/* Claim the rest of the block so the next commit cannot reuse it. */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC covers everything up to, but not including, itself. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
687
688/*
689 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
690 * Returns false if there's not enough space.
691 */
692static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
693 u32 *crc)
694{
695 struct ext4_fc_tl tl;
696 u8 *dst;
697
698 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
699 if (!dst)
700 return false;
701
702 tl.fc_tag = cpu_to_le16(tag);
703 tl.fc_len = cpu_to_le16(len);
704
705 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
706 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
707
708 return true;
709}
710
711/* Same as above, but adds dentry tlv. */
712static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
713 int parent_ino, int ino, int dlen,
714 const unsigned char *dname,
715 u32 *crc)
716{
717 struct ext4_fc_dentry_info fcd;
718 struct ext4_fc_tl tl;
719 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
720 crc);
721
722 if (!dst)
723 return false;
724
725 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
726 fcd.fc_ino = cpu_to_le32(ino);
727 tl.fc_tag = cpu_to_le16(tag);
728 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
729 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
730 dst += sizeof(tl);
731 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
732 dst += sizeof(fcd);
733 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
734 dst += dlen;
735
736 return true;
737}
738
739/*
740 * Writes inode in the fast commit space under TLV with tag @tag.
741 * Returns 0 on success, error on failure.
742 */
743static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
744{
745 struct ext4_inode_info *ei = EXT4_I(inode);
746 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
747 int ret;
748 struct ext4_iloc iloc;
749 struct ext4_fc_inode fc_inode;
750 struct ext4_fc_tl tl;
751 u8 *dst;
752
753 ret = ext4_get_inode_loc(inode, &iloc);
754 if (ret)
755 return ret;
756
757 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
758 inode_len += ei->i_extra_isize;
759
760 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
761 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
762 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
763
764 dst = ext4_fc_reserve_space(inode->i_sb,
765 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
766 if (!dst)
767 return -ECANCELED;
768
769 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
770 return -ECANCELED;
771 dst += sizeof(tl);
772 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
773 return -ECANCELED;
774 dst += sizeof(fc_inode);
775 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
776 inode_len, crc))
777 return -ECANCELED;
778
779 return 0;
780}
781
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot the tracked range and reset it, all under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/* Walk the mapped and unmapped extents covering the tracked range. */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Unmapped region: record it as a deleted range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped extent: record it as an added range. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
849
850
/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here, newly tracked updates are diverted to staging queues. */
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/* Wait for any in-flight ext4_fc_start_update() sections. */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			/* Recheck after arming the wait to avoid a lost wakeup. */
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the spinlock; submitting inode data may sleep. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
887
/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes marked COMMITTING had their data submitted. */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the spinlock; waiting for I/O sleeps. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
912
/*
 * Commit all the directory entry updates.
 *
 * Walks the main dentry update queue and emits a dentry TLV for each
 * queued operation, accumulating the running commit CRC in *crc. For
 * EXT4_FC_TAG_CREAT, the referenced inode and its data ranges are
 * written out first (see the comment inside) so replay can create the
 * inode before linking it.
 *
 * Called with sbi->s_fc_lock held; the lock is dropped around the block
 * writes and re-taken before returning, on both the success and the
 * error path.
 *
 * NOTE(review): the queue is walked with list_for_each_safe() while
 * s_fc_lock is repeatedly dropped for I/O; this presumably relies on
 * concurrent trackers only appending to the staging queue while a
 * commit is running — confirm against the ext4_fc_track_* callers.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* Link/unlink need only the dentry TLV itself. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: locate the tracked inode this dentry refers to. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Callers expect s_fc_lock held on return, even on error. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
990
/*
 * Perform one fast commit.
 *
 * Steps, in order:
 *  1. Submit and wait for data writeback of all tracked inodes.
 *  2. If this is the first fast commit of the running transaction
 *     (s_fc_bytes == 0), emit an EXT4_FC_TAG_HEAD TLV.
 *  3. Emit all queued dentry updates (plus inodes for creates).
 *  4. Emit inode + data-range TLVs for the remaining committing inodes.
 *  5. Emit the tail TLV carrying the accumulated CRC.
 *
 * Block writes are batched under a blk plug. Returns 0 on success or a
 * negative error, upon which the caller falls back to a full commit.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		/*
		 * NOTE(review): this failure path leaves ret == 0, so the
		 * caller sees success without a head tag having been
		 * written — presumably tolerated because no tail follows
		 * either; confirm.
		 */
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	/* Now write inode (+ data ranges) for every committing inode. */
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
		/* Remember which fast commit this inode was flushed in. */
		EXT4_I(inode)->i_fc_committed_subtid =
			atomic_read(&sbi->s_fc_subtid);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}
1057
1058/*
1059 * The main commit entry point. Performs a fast commit for transaction
1060 * commit_tid if needed. If it's not possible to perform a fast commit
1061 * due to various reasons, we fall back to full commit. Returns 0
1062 * on success, error otherwise.
1063 */
1064int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1065{
1066 struct super_block *sb = (struct super_block *)(journal->j_private);
1067 struct ext4_sb_info *sbi = EXT4_SB(sb);
1068 int nblks = 0, ret, bsize = journal->j_blocksize;
1069 int subtid = atomic_read(&sbi->s_fc_subtid);
1070 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1071 ktime_t start_time, commit_time;
1072
1073 trace_ext4_fc_commit_start(sb);
1074
1075 start_time = ktime_get();
1076
1077 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1078 (ext4_fc_is_ineligible(sb))) {
1079 reason = EXT4_FC_REASON_INELIGIBLE;
1080 goto out;
1081 }
1082
1083restart_fc:
1084 ret = jbd2_fc_begin_commit(journal, commit_tid);
1085 if (ret == -EALREADY) {
1086 /* There was an ongoing commit, check if we need to restart */
1087 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1088 commit_tid > journal->j_commit_sequence)
1089 goto restart_fc;
1090 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1091 goto out;
1092 } else if (ret) {
1093 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1094 reason = EXT4_FC_REASON_FC_START_FAILED;
1095 goto out;
1096 }
1097
1098 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1099 ret = ext4_fc_perform_commit(journal);
1100 if (ret < 0) {
1101 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1102 reason = EXT4_FC_REASON_FC_FAILED;
1103 goto out;
1104 }
1105 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1106 ret = jbd2_fc_wait_bufs(journal, nblks);
1107 if (ret < 0) {
1108 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1109 reason = EXT4_FC_REASON_FC_FAILED;
1110 goto out;
1111 }
1112 atomic_inc(&sbi->s_fc_subtid);
1113 jbd2_fc_end_commit(journal);
1114out:
1115 /* Has any ineligible update happened since we started? */
1116 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1117 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1118 reason = EXT4_FC_REASON_INELIGIBLE;
1119 }
1120
1121 spin_lock(&sbi->s_fc_lock);
1122 if (reason != EXT4_FC_REASON_OK &&
1123 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1124 sbi->s_fc_stats.fc_ineligible_commits++;
1125 } else {
1126 sbi->s_fc_stats.fc_num_commits++;
1127 sbi->s_fc_stats.fc_numblks += nblks;
1128 }
1129 spin_unlock(&sbi->s_fc_lock);
1130 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1131 trace_ext4_fc_commit_stop(sb, nblks, reason);
1132 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1133 /*
1134 * weight the commit time higher than the average time so we don't
1135 * react too strongly to vast changes in the commit time
1136 */
1137 if (likely(sbi->s_fc_avg_commit_time))
1138 sbi->s_fc_avg_commit_time = (commit_time +
1139 sbi->s_fc_avg_commit_time * 3) / 4;
1140 else
1141 sbi->s_fc_avg_commit_time = commit_time;
1142 jbd_debug(1,
1143 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1144 nblks, reason, subtid);
1145 if (reason == EXT4_FC_REASON_FC_FAILED)
Harshad Shirwadkar0bce5772020-11-05 19:58:58 -08001146 return jbd2_fc_end_commit_fallback(journal);
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07001147 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1148 reason == EXT4_FC_REASON_INELIGIBLE)
1149 return jbd2_complete_transaction(journal, commit_tid);
1150 return 0;
1151}
1152
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07001153/*
1154 * Fast commit cleanup routine. This is called after every fast commit and
1155 * full commit. full is true if we are called after a full commit.
1156 */
1157static void ext4_fc_cleanup(journal_t *journal, int full)
1158{
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07001159 struct super_block *sb = journal->j_private;
1160 struct ext4_sb_info *sbi = EXT4_SB(sb);
1161 struct ext4_inode_info *iter;
1162 struct ext4_fc_dentry_update *fc_dentry;
1163 struct list_head *pos, *n;
1164
1165 if (full && sbi->s_fc_bh)
1166 sbi->s_fc_bh = NULL;
1167
1168 jbd2_fc_release_bufs(journal);
1169
1170 spin_lock(&sbi->s_fc_lock);
1171 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1172 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1173 list_del_init(&iter->i_fc_list);
1174 ext4_clear_inode_state(&iter->vfs_inode,
1175 EXT4_STATE_FC_COMMITTING);
1176 ext4_fc_reset_inode(&iter->vfs_inode);
1177 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1178 smp_mb();
1179#if (BITS_PER_LONG < 64)
1180 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1181#else
1182 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1183#endif
1184 }
1185
1186 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1187 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1188 struct ext4_fc_dentry_update,
1189 fcd_list);
1190 list_del_init(&fc_dentry->fcd_list);
1191 spin_unlock(&sbi->s_fc_lock);
1192
1193 if (fc_dentry->fcd_name.name &&
1194 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1195 kfree(fc_dentry->fcd_name.name);
1196 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1197 spin_lock(&sbi->s_fc_lock);
1198 }
1199
1200 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1201 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1202 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1203 &sbi->s_fc_q[FC_Q_STAGING]);
1204
Harshad Shirwadkarababea72020-10-26 21:49:15 -07001205 sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1206 sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07001207
1208 if (full)
1209 sbi->s_fc_bytes = 0;
1210 spin_unlock(&sbi->s_fc_lock);
1211 trace_ext4_fc_stats(sb);
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07001212}
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07001213
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07001214/* Ext4 Replay Path Routines */
1215
1216/* Get length of a particular tlv */
1217static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1218{
1219 return le16_to_cpu(tl->fc_len);
1220}
1221
1222/* Get a pointer to "value" of a tlv */
1223static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1224{
1225 return (u8 *)tl + sizeof(*tl);
1226}
1227
/*
 * Helper struct for dentry replay routines: the decoded (host-endian)
 * form of an on-disk struct ext4_fc_dentry_info, filled by tl_to_darg().
 */
struct dentry_info_args {
	/*
	 * parent_ino: inode number of the parent directory;
	 * dname_len: length of dname in bytes;
	 * ino: inode number the dentry refers to;
	 * inode_len: not set by tl_to_darg() — presumably used by other
	 * callers outside this view; confirm before relying on it.
	 */
	int parent_ino, dname_len, ino, inode_len;
	/* Points into the TLV buffer; bounded by dname_len, and
	 * NOTE(review): presumably not NUL-terminated — confirm. */
	char *dname;
};
1233
1234static inline void tl_to_darg(struct dentry_info_args *darg,
1235 struct ext4_fc_tl *tl)
1236{
1237 struct ext4_fc_dentry_info *fcd;
1238
1239 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1240
1241 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1242 darg->ino = le32_to_cpu(fcd->fc_ino);
1243 darg->dname = fcd->fc_dname;
1244 darg->dname_len = ext4_fc_tag_len(tl) -
1245 sizeof(struct ext4_fc_dentry_info);
1246}
1247
/*
 * Unlink replay function.
 *
 * Re-applies an EXT4_FC_TAG_UNLINK TLV by removing the logged entry from
 * its parent directory via __ext4_unlink(). A missing inode or parent is
 * not an error — later tags may have removed them — so those cases (and
 * -ENOENT from the unlink itself) return 0 and replay continues.
 */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* entry.name aliases the TLV buffer — valid only for this call */
	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT ok coz it might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}
1286
/*
 * Create the directory entry described by @darg pointing at @inode.
 *
 * Shared by link and create replay. Builds a temporary dentry pair (an
 * alias for the parent plus a child dentry) solely so __ext4_link() can
 * be reused, then drops both again. -EEXIST from the link is tolerated:
 * the entry may already have been persisted before the crash, or this
 * tag may have been replayed once already.
 *
 * Returns 0 on success and when the parent is missing (replay goes on);
 * a negative error only for allocation/link failures.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * d_obtain_alias() took over the reference to dir, so iput(dir)
	 * directly only when no alias dentry was obtained.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1344
1345/* Link replay function */
1346static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1347{
1348 struct inode *inode;
1349 struct dentry_info_args darg;
1350 int ret = 0;
1351
1352 tl_to_darg(&darg, tl);
1353 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1354 darg.parent_ino, darg.dname_len);
1355
1356 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1357 if (IS_ERR_OR_NULL(inode)) {
1358 jbd_debug(1, "Inode not found.");
1359 return 0;
1360 }
1361
1362 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1363 iput(inode);
1364 return ret;
1365}
1366
1367/*
1368 * Record all the modified inodes during replay. We use this later to setup
1369 * block bitmaps correctly.
1370 */
1371static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1372{
1373 struct ext4_fc_replay_state *state;
1374 int i;
1375
1376 state = &EXT4_SB(sb)->s_fc_replay_state;
1377 for (i = 0; i < state->fc_modified_inodes_used; i++)
1378 if (state->fc_modified_inodes[i] == ino)
1379 return 0;
1380 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1381 state->fc_modified_inodes_size +=
1382 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1383 state->fc_modified_inodes = krealloc(
1384 state->fc_modified_inodes, sizeof(int) *
1385 state->fc_modified_inodes_size,
1386 GFP_KERNEL);
1387 if (!state->fc_modified_inodes)
1388 return -ENOMEM;
1389 }
1390 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1391 return 0;
1392}
1393
1394/*
1395 * Inode replay function
1396 */
1397static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1398{
1399 struct ext4_fc_inode *fc_inode;
1400 struct ext4_inode *raw_inode;
1401 struct ext4_inode *raw_fc_inode;
1402 struct inode *inode = NULL;
1403 struct ext4_iloc iloc;
1404 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1405 struct ext4_extent_header *eh;
1406
1407 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1408
1409 ino = le32_to_cpu(fc_inode->fc_ino);
1410 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1411
1412 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1413 if (!IS_ERR_OR_NULL(inode)) {
1414 ext4_ext_clear_bb(inode);
1415 iput(inode);
1416 }
1417
1418 ext4_fc_record_modified_inode(sb, ino);
1419
1420 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1421 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1422 if (ret)
1423 goto out;
1424
1425 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1426 raw_inode = ext4_raw_inode(&iloc);
1427
1428 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1429 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1430 inode_len - offsetof(struct ext4_inode, i_generation));
1431 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1432 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1433 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1434 memset(eh, 0, sizeof(*eh));
1435 eh->eh_magic = EXT4_EXT_MAGIC;
1436 eh->eh_max = cpu_to_le16(
1437 (sizeof(raw_inode->i_block) -
1438 sizeof(struct ext4_extent_header))
1439 / sizeof(struct ext4_extent));
1440 }
1441 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1442 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1443 sizeof(raw_inode->i_block));
1444 }
1445
1446 /* Immediately update the inode on disk. */
1447 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1448 if (ret)
1449 goto out;
1450 ret = sync_dirty_buffer(iloc.bh);
1451 if (ret)
1452 goto out;
1453 ret = ext4_mark_inode_used(sb, ino);
1454 if (ret)
1455 goto out;
1456
1457 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1458 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1459 if (IS_ERR_OR_NULL(inode)) {
1460 jbd_debug(1, "Inode not found.");
1461 return -EFSCORRUPTED;
1462 }
1463
1464 /*
1465 * Our allocator could have made different decisions than before
1466 * crashing. This should be fixed but until then, we calculate
1467 * the number of blocks the inode.
1468 */
1469 ext4_ext_replay_set_iblocks(inode);
1470
1471 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1472 ext4_reset_inode_seed(inode);
1473
1474 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1475 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1476 sync_dirty_buffer(iloc.bh);
1477 brelse(iloc.bh);
1478out:
1479 iput(inode);
1480 if (!ret)
1481 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1482
1483 return 0;
1484}
1485
1486/*
1487 * Dentry create replay function.
1488 *
1489 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1490 * inode for which we are trying to create a dentry here, should already have
1491 * been replayed before we start here.
1492 */
1493static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1494{
1495 int ret = 0;
1496 struct inode *inode = NULL;
1497 struct inode *dir = NULL;
1498 struct dentry_info_args darg;
1499
1500 tl_to_darg(&darg, tl);
1501
1502 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1503 darg.parent_ino, darg.dname_len);
1504
1505 /* This takes care of update group descriptor and other metadata */
1506 ret = ext4_mark_inode_used(sb, darg.ino);
1507 if (ret)
1508 goto out;
1509
1510 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1511 if (IS_ERR_OR_NULL(inode)) {
1512 jbd_debug(1, "inode %d not found.", darg.ino);
1513 inode = NULL;
1514 ret = -EINVAL;
1515 goto out;
1516 }
1517
1518 if (S_ISDIR(inode->i_mode)) {
1519 /*
1520 * If we are creating a directory, we need to make sure that the
1521 * dot and dot dot dirents are setup properly.
1522 */
1523 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1524 if (IS_ERR_OR_NULL(dir)) {
1525 jbd_debug(1, "Dir %d not found.", darg.ino);
1526 goto out;
1527 }
1528 ret = ext4_init_new_dir(NULL, dir, inode);
1529 iput(dir);
1530 if (ret) {
1531 ret = 0;
1532 goto out;
1533 }
1534 }
1535 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1536 if (ret)
1537 goto out;
1538 set_nlink(inode, 1);
1539 ext4_mark_inode_dirty(NULL, inode);
1540out:
1541 if (inode)
1542 iput(inode);
1543 return ret;
1544}
1545
1546/*
1547 * Record physical disk regions which are in use as per fast commit area. Our
1548 * simple replay phase allocator excludes these regions from allocation.
1549 */
1550static int ext4_fc_record_regions(struct super_block *sb, int ino,
1551 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1552{
1553 struct ext4_fc_replay_state *state;
1554 struct ext4_fc_alloc_region *region;
1555
1556 state = &EXT4_SB(sb)->s_fc_replay_state;
1557 if (state->fc_regions_used == state->fc_regions_size) {
1558 state->fc_regions_size +=
1559 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1560 state->fc_regions = krealloc(
1561 state->fc_regions,
1562 state->fc_regions_size *
1563 sizeof(struct ext4_fc_alloc_region),
1564 GFP_KERNEL);
1565 if (!state->fc_regions)
1566 return -ENOMEM;
1567 }
1568 region = &state->fc_regions[state->fc_regions_used++];
1569 region->ino = ino;
1570 region->lblk = lblk;
1571 region->pblk = pblk;
1572 region->len = len;
1573
1574 return 0;
1575}
1576
/*
 * Replay add range tag.
 *
 * Ensures the logical range logged in the TLV maps to the logged physical
 * blocks. Each sub-range falls into one of three cases:
 *  - not mapped: insert a fresh extent pointing at the logged pblk;
 *  - mapped to a different pblk: repoint it and free the old blocks
 *    (globally-correct bitmap state is restored afterwards by
 *    ext4_fc_set_bitmaps_and_counters());
 *  - mapped to the right pblk: only sync the written/unwritten state.
 * Errors are deliberately swallowed (always returns 0) so that replay of
 * the remaining tags continues.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Shrink once more against i_size after the whole range is done. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1705
/*
 * Replay DEL_RANGE tag.
 *
 * Frees every block currently mapped inside the logged logical range
 * (bitmap-only via ext4_mb_mark_bb(); shared blocks are re-marked used
 * later by ext4_fc_set_bitmaps_and_counters()), then punches the hole in
 * the extent tree and shrinks it to i_size. Errors from the punch are
 * only logged; the function always returns 0 so replay continues.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			/* Mapped: free the blocks in the bitmap. */
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			/* Hole: skip past it. */
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}
1765
1766static inline const char *tag2str(u16 tag)
1767{
1768 switch (tag) {
1769 case EXT4_FC_TAG_LINK:
1770 return "TAG_ADD_ENTRY";
1771 case EXT4_FC_TAG_UNLINK:
1772 return "TAG_DEL_ENTRY";
1773 case EXT4_FC_TAG_ADD_RANGE:
1774 return "TAG_ADD_RANGE";
1775 case EXT4_FC_TAG_CREAT:
1776 return "TAG_CREAT_DENTRY";
1777 case EXT4_FC_TAG_DEL_RANGE:
1778 return "TAG_DEL_RANGE";
1779 case EXT4_FC_TAG_INODE:
1780 return "TAG_INODE";
1781 case EXT4_FC_TAG_PAD:
1782 return "TAG_PAD";
1783 case EXT4_FC_TAG_TAIL:
1784 return "TAG_TAIL";
1785 case EXT4_FC_TAG_HEAD:
1786 return "TAG_HEAD";
1787 default:
1788 return "TAG_ERROR";
1789 }
1790}
1791
/*
 * Final bitmap fixup after all replay tags have been applied.
 *
 * Walks every inode recorded by ext4_fc_record_modified_inode() and
 * marks both its mapped data blocks and its extent tree index blocks
 * as in-use in the block bitmaps (updating group counters). This undoes
 * any over-eager freeing done while individual tags were replayed.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical range of the inode. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent tree path blocks used too */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				/* Hole: advance at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1839
1840/*
1841 * Check if block is in excluded regions for block allocation. The simple
1842 * allocator that runs during replay phase is calls this function to see
1843 * if it is okay to use a block.
1844 */
1845bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1846{
1847 int i;
1848 struct ext4_fc_replay_state *state;
1849
1850 state = &EXT4_SB(sb)->s_fc_replay_state;
1851 for (i = 0; i < state->fc_regions_valid; i++) {
1852 if (state->fc_regions[i].ino == 0 ||
1853 state->fc_regions[i].len == 0)
1854 continue;
1855 if (blk >= state->fc_regions[i].pblk &&
1856 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1857 return true;
1858 }
1859 return false;
1860}
1861
1862/* Cleanup function called after replay */
1863void ext4_fc_replay_cleanup(struct super_block *sb)
1864{
1865 struct ext4_sb_info *sbi = EXT4_SB(sb);
1866
1867 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1868 kfree(sbi->s_fc_replay_state.fc_regions);
1869 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1870}
1871
1872/*
1873 * Recovery Scan phase handler
1874 *
1875 * This function is called during the scan phase and is responsible
1876 * for doing following things:
1877 * - Make sure the fast commit area has valid tags for replay
1878 * - Count number of tags that need to be replayed by the replay handler
1879 * - Verify CRC
1880 * - Create a list of excluded blocks for allocation during replay phase
1881 *
1882 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1883 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1884 * to indicate that scan has finished and JBD2 can now start replay phase.
1885 * It returns a negative error to indicate that there was an error. At the end
1886 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1887 * to indicate the number of tags that need to replayed during the replay phase.
1888 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First block of the fast commit area: reset the scan state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive strictly in order. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Remember the logged physical range so the replay
			 * allocator never hands those blocks out.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Ordinary tags: count them and fold into the CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * Tail carries the expected CRC (excluded from the
			 * checksum itself). A match commits everything seen
			 * so far: num_tags and the valid region count become
			 * visible to the replay phase.
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			/* A head for a different TID ends this scan. */
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop (or fail if nothing valid yet). */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
1998
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07001999/*
2000 * Main recovery path entry point.
 * The meaning of return codes is the same as above.
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07002002 */
2003static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2004 enum passtype pass, int off, tid_t expected_tid)
2005{
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07002006 struct super_block *sb = journal->j_private;
2007 struct ext4_sb_info *sbi = EXT4_SB(sb);
2008 struct ext4_fc_tl *tl;
2009 __u8 *start, *end;
2010 int ret = JBD2_FC_REPLAY_CONTINUE;
2011 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2012 struct ext4_fc_tail *tail;
2013
2014 if (pass == PASS_SCAN) {
2015 state->fc_current_pass = PASS_SCAN;
2016 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2017 }
2018
2019 if (state->fc_current_pass != pass) {
2020 state->fc_current_pass = pass;
2021 sbi->s_mount_state |= EXT4_FC_REPLAY;
2022 }
2023 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2024 jbd_debug(1, "Replay stops\n");
2025 ext4_fc_set_bitmaps_and_counters(sb);
2026 return 0;
2027 }
2028
2029#ifdef CONFIG_EXT4_DEBUG
2030 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2031 pr_warn("Dropping fc block %d because max_replay set\n", off);
2032 return JBD2_FC_REPLAY_STOP;
2033 }
2034#endif
2035
2036 start = (u8 *)bh->b_data;
2037 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2038
2039 fc_for_each_tl(start, end, tl) {
2040 if (state->fc_replay_num_tags == 0) {
2041 ret = JBD2_FC_REPLAY_STOP;
2042 ext4_fc_set_bitmaps_and_counters(sb);
2043 break;
2044 }
2045 jbd_debug(3, "Replay phase, tag:%s\n",
2046 tag2str(le16_to_cpu(tl->fc_tag)));
2047 state->fc_replay_num_tags--;
2048 switch (le16_to_cpu(tl->fc_tag)) {
2049 case EXT4_FC_TAG_LINK:
2050 ret = ext4_fc_replay_link(sb, tl);
2051 break;
2052 case EXT4_FC_TAG_UNLINK:
2053 ret = ext4_fc_replay_unlink(sb, tl);
2054 break;
2055 case EXT4_FC_TAG_ADD_RANGE:
2056 ret = ext4_fc_replay_add_range(sb, tl);
2057 break;
2058 case EXT4_FC_TAG_CREAT:
2059 ret = ext4_fc_replay_create(sb, tl);
2060 break;
2061 case EXT4_FC_TAG_DEL_RANGE:
2062 ret = ext4_fc_replay_del_range(sb, tl);
2063 break;
2064 case EXT4_FC_TAG_INODE:
2065 ret = ext4_fc_replay_inode(sb, tl);
2066 break;
2067 case EXT4_FC_TAG_PAD:
2068 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2069 ext4_fc_tag_len(tl), 0);
2070 break;
2071 case EXT4_FC_TAG_TAIL:
2072 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2073 ext4_fc_tag_len(tl), 0);
2074 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2075 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2076 break;
2077 case EXT4_FC_TAG_HEAD:
2078 break;
2079 default:
2080 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2081 ext4_fc_tag_len(tl), 0);
2082 ret = -ECANCELED;
2083 break;
2084 }
2085 if (ret < 0)
2086 break;
2087 ret = JBD2_FC_REPLAY_CONTINUE;
2088 }
2089 return ret;
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07002090}
2091
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002092void ext4_fc_init(struct super_block *sb, journal_t *journal)
2093{
Harshad Shirwadkar5b849b52020-10-15 13:37:58 -07002094 /*
2095 * We set replay callback even if fast commit disabled because we may
2096 * could still have fast commit blocks that need to be replayed even if
2097 * fast commit has now been turned off.
2098 */
2099 journal->j_fc_replay_callback = ext4_fc_replay;
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002100 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2101 return;
Harshad Shirwadkarff780b92020-10-15 13:37:56 -07002102 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07002103}
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07002104
/*
 * Human-readable labels printed by ext4_fc_info_show(), indexed 0 ..
 * EXT4_FC_REASON_MAX - 1 by the ineligibility reason code — presumably
 * matching the EXT4_FC_REASON_* enum order in ext4.h; verify ordering
 * there before adding or rearranging entries.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
2116
2117int ext4_fc_info_show(struct seq_file *seq, void *v)
2118{
2119 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2120 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2121 int i;
2122
2123 if (v != SEQ_START_TOKEN)
2124 return 0;
2125
2126 seq_printf(seq,
2127 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2128 stats->fc_num_commits, stats->fc_ineligible_commits,
2129 stats->fc_numblks,
2130 div_u64(sbi->s_fc_avg_commit_time, 1000));
2131 seq_puts(seq, "Ineligible reasons:\n");
2132 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2133 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2134 stats->fc_ineligible_reason_count[i]);
2135
2136 return 0;
2137}
2138
Harshad Shirwadkaraa75f4d2020-10-15 13:37:57 -07002139int __init ext4_fc_init_dentry_cache(void)
2140{
2141 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2142 SLAB_RECLAIM_ACCOUNT);
2143
2144 if (ext4_fc_dentry_cachep == NULL)
2145 return -ENOMEM;
2146
2147 return 0;
2148}