// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
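 *
 * As an illustrative sketch (the authoritative layout is struct ext4_fc_tl
 * referenced above; this is not the exact scan code), a replay-side walk over
 * the TLV log looks like:
 *
 *	struct ext4_fc_tl tl;		// { __le16 fc_tag; __le16 fc_len; }
 *	u8 *cur = start, *end = start + len;
 *
 *	while (cur + sizeof(tl) <= end) {
 *		memcpy(&tl, cur, sizeof(tl));
 *		// value bytes follow the header; see ext4_fc_tag_val()
 *		// dispatch on le16_to_cpu(tl.fc_tag) and consume
 *		// le16_to_cpu(tl.fc_len) bytes of value
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}
 *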
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update(). A typical caller brackets its work as shown below.
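 *
 * A minimal sketch of the expected calling pattern (illustrative; the real
 * call sites are the VFS entry points, e.g. the write and setattr paths):
 *
 *	ext4_fc_start_update(inode);
 *	// ... modify the inode under a jbd2 handle ...
 *	ext4_fc_track_inode(handle, inode);
 *	ext4_fc_stop_update(inode);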
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), as well as one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
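 *
 * An illustrative usage sketch (the reason code here is an example; see the
 * EXT4_FC_REASON_* values for the real set):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform an operation that has no fast commit replay support ...
 *	ext4_fc_stop_ineligible(sb);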
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means that we may end up with
 * multiple tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 * |<-------- Fast Commit 1 -------->|<-------- Fast Commit 2 -------->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
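 *
 * In pseudocode (a conceptual sketch, not the exact scan implementation):
 *
 *	valid_end = fc_area_start;
 *	for each TLV t in the fast commit area:
 *		if t.tag == EXT4_FC_TAG_TAIL and the tail CRC matches:
 *			valid_end = offset just past t;
 *	// replay only the TLVs that lie before valid_end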
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    routines than the updates themselves. This can be made more fine grained
 *    by combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed,
 * we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible. This means that the next commit
 * operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
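 *
 * For example (a hypothetical sequence): two tracked updates to the same
 * inode within one transaction result in a first call with update = 0
 * (initialize tracking) followed by a second call with update = 1 (merge
 * into the previously tracked state).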
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
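/*
 * Tracked ranges are merged: e.g. (hypothetical values) if blocks [10, 20]
 * are already tracked and an update for [5, 12] arrives, the min()/max()
 * logic below widens the tracked range to [5, 20].
 */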
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
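 *
 * Block layout after padding, as an illustrative diagram:
 *
 *	|<-------------- bsize -------------->|
 *	| used TLVs |PAD tl| zeroes ...       |  <- old block, written out
 *	                                           via ext4_fc_submit_bh()
 *	| reserved len ...                    |  <- new block obtained from
 *	                                           jbd2_fc_get_buf()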
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
				   int parent_ino, int ino, int dlen,
				   const unsigned char *dname,
				   u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				       fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
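 *
 * A sketch of the expected call site (illustrative; the actual caller is the
 * fsync path, and the TID argument there may differ):
 *
 *	ret = ext4_fc_commit(EXT4_SB(sb)->s_journal,
 *			     EXT4_I(inode)->i_sync_tid);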
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
	sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Get length of a particular tlv */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	return le16_to_cpu(tl->fc_len);
}

/* Get a pointer to "value" of a tlv */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
	return (u8 *)tl + sizeof(*tl);
}

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

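/*
 * On-disk layout of a dentry TLV's value, as written by
 * ext4_fc_add_dentry_tlv() (a sketch of this file's format; all fields
 * little-endian):
 *
 *	[fc_tag][fc_len][fc_parent_ino][fc_ino][dname bytes ...]
 *
 * tl_to_darg() below unpacks the value part into struct dentry_info_args.
 */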
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
	       inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks the inode occupies.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means that the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we get here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			     darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
1516 jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area. Our
 * simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1696
1697/* Replay DEL_RANGE tag */
1698static int
1699ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1700{
1701 struct inode *inode;
1702 struct ext4_fc_del_range *lrange;
1703 struct ext4_map_blocks map;
1704 ext4_lblk_t cur, remaining;
1705 int ret;
1706
1707 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1708 cur = le32_to_cpu(lrange->fc_lblk);
1709 remaining = le32_to_cpu(lrange->fc_len);
1710
1711 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1712 le32_to_cpu(lrange->fc_ino), cur, remaining);
1713
1714 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1715 if (IS_ERR_OR_NULL(inode)) {
1716 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1717 return 0;
1718 }
1719
1720 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1721
1722 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1723 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1724 le32_to_cpu(lrange->fc_len));
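	/*
	 * Free all the blocks currently backing this logical range in the
	 * block bitmaps; the extents themselves are torn down by the punch
	 * hole call below.
	 */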
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	ret = ext4_punch_hole(inode,
		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}

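/*
 * After all tags have been replayed, walk every inode recorded as modified
 * and mark both its data blocks and its extent tree blocks as in-use in the
 * block bitmaps. This is what makes it safe for the replay handlers above to
 * mark blocks free eagerly: anything still referenced is re-marked here.
 */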
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
					map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
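	/*
	 * Walk the TLVs in this block. Every valid tag is counted and mixed
	 * into the running checksum; a TAIL tag whose TID and CRC both match
	 * commits everything counted so far as replayable.
	 */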
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan()
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

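	/*
	 * Dispatch each TLV in this block to its replay handler until all
	 * the tags counted by the scan phase have been consumed.
	 */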
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

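/*
 * Human readable strings for the ineligibility reasons, indexed by the
 * EXT4_FC_REASON_* values; the order here must stay in sync with that enum.
 */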
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};

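/*
 * seq_file show callback used to export fast commit statistics (this is the
 * handler behind the fc_info file that ext4 exposes under procfs).
 */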
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}