// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and is instead derived during
 *				  replay.
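 *
 * As an illustrative sketch of the on-disk encoding (assuming the TLV header
 * declared in ext4_fc.h; the two fields below match how the commit path
 * fills them in with cpu_to_le16()):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length in bytes of the value that follows
 *	};
 *
 * so the fast commit area is simply a byte stream of the form
 * [fc_tag][fc_len][value][fc_tag][fc_len][value]...
 *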
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
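 *
 * A minimal usage sketch (hypothetical call site; the real callers are the
 * higher level VFS paths):
 *
 *	ext4_fc_start_update(inode);	// blocks while this inode is being
 *					// committed by a fast commit
 *	// ... modify the inode ...
 *	ext4_fc_stop_update(inode);	// wakes up any waiting fast commit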
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important to
 *   make one more fast commit fall back to a full commit after the stop call
 *   so that it is guaranteed that the fast commit ineligible operation
 *   contained within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 *   is followed by at least 1 full commit.
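 *
 * An illustrative sketch of the bracketing pattern (hypothetical call site;
 * the reason code is just an example):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform an operation fast commit can't replay yet ...
 *	ext4_fc_stop_ineligible(sb);	// also forces the next commit full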
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means that we may end up with
 * multiple tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1  --->|<---     Fast Commit 2     --->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
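 *
 * A rough sketch of what makes a tail "valid" (illustrative only; the actual
 * scan logic lives in the replay scan phase, outside this excerpt): the
 * writer folds everything it emits into a running CRC (see ext4_fc_memcpy()
 * and ext4_fc_memzero() below) and stores that CRC in the tail's fc_crc
 * field; replay recomputes the CRC over the same bytes and accepts a fast
 * commit only if the stored and computed values match.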
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called at much higher levels
 *    than where the actual update happens. This can be made more fine grained
 *    by combining with ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
	ei->i_fc_committed_subtid = 0;
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the VFS callbacks before performing any inode
 * update. It blocks if there's an ongoing fast commit on the inode in
 * question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 * to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/*
 * __track_fn for tracking data updates; merges the new range into the range
 * already tracked for this inode, if any.
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
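
/*
 * An illustrative (hypothetical) call site for the tracking API above: a
 * write path that has just mapped blocks [m_lblk, m_lblk + m_len - 1] could
 * record the dirty range with:
 *
 *	ext4_fc_track_range(handle, inode, map->m_lblk,
 *			    map->m_lblk + map->m_len - 1);
 */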

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * At commit time, this function is used to manage fast commit block space.
 * We don't split a fast commit log onto different blocks. So this function
 * makes sure that if there's not enough space on the current block, the
 * remaining space in the current block is marked as unused by adding an
 * EXT4_FC_TAG_PAD tag. In that case, a new block is requested from jbd2 and
 * the CRC is updated to reflect the padding we added.
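 *
 * Illustrative numbers (not taken from a real trace): with a 4096 byte
 * journal block and 4080 bytes already used, only 16 bytes remain, so a
 * request for 32 bytes cannot be satisfied in place; the remainder of the
 * block is consumed by a PAD TLV plus zeroes, and the 32 bytes are handed
 * out from the start of a fresh jbd2 fast commit buffer.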
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has its length set to that of the remaining space on the block.
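 *
 * Concretely, the code below sets
 *
 *	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 *
 * which stretches the tail TLV's recorded length out to the end of the
 * current block, and then rounds s_fc_bytes up to the block boundary so
 * that the next commit starts on a fresh block.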
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length and value, and updates the CRC. Returns true if the TLV
 * was added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds a dentry TLV. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
		EXT4_I(inode)->i_fc_committed_subtid =
			atomic_read(&sbi->s_fc_subtid);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time, i.e. keep
	 * an exponentially weighted moving average:
	 * avg = (commit_time + 3 * avg) / 4.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
	sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}
/* Ext4 Replay Path Routines */

/* Get length of a particular tlv */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	return le16_to_cpu(tl->fc_len);
}

/* Get a pointer to "value" of a tlv */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
	return (u8 *)tl + sizeof(*tl);
}
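
/*
 * An illustrative sketch (not the authoritative parser) of walking a block
 * of TLVs with the two helpers above; the real scan and replay loops live
 * in the replay phase of this file:
 *
 *	u8 *cur = start;
 *	struct ext4_fc_tl *tl;
 *
 *	while (cur < end) {
 *		tl = (struct ext4_fc_tl *)cur;
 *		// dispatch on le16_to_cpu(tl->fc_tag); the payload starts at
 *		// ext4_fc_tag_val(tl) and is ext4_fc_tag_len(tl) bytes long
 *		cur = ext4_fc_tag_val(tl) + ext4_fc_tag_len(tl);
 *	}
 */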

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl *tl)
{
	struct ext4_fc_dentry_info *fcd;

	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);

	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
	darg->ino = le32_to_cpu(fcd->fc_ino);
	darg->dname = fcd->fc_dname;
	darg->dname_len = ext4_fc_tag_len(tl) -
		sizeof(struct ext4_fc_dentry_info);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed, since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks the inode occupies here.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means that the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per fast commit area. Our
 * simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write(&EXT4_I(inode)->i_data_sem);
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1698
/*
 * Replay DEL_RANGE tag: return the range's blocks to the replay block
 * bitmap, then punch the corresponding hole in the inode.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct ext4_fc_del_range *lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
	cur = le32_to_cpu(lrange->fc_lblk);
	remaining = le32_to_cpu(lrange->fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
		le32_to_cpu(lrange->fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
			le32_to_cpu(lrange->fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0) {
			iput(inode);
			return 0;
		}
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			/* Mark the mapped blocks free in the replay bitmap. */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	/*
	 * Cast to loff_t before shifting: fc_lblk and fc_len are 32-bit
	 * values and the shift would otherwise overflow for large offsets.
	 */
	ret = ext4_punch_hole(inode,
		(loff_t)le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
		(loff_t)le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
	if (ret)
		jbd_debug(1, "ext4_punch_hole returned %d", ret);
	ext4_ext_replay_shrink_inode(inode,
		i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
	iput(inode);

	return 0;
}

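/* Map a fast commit tag to a human-readable name for trace/debug output. */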
static inline const char *tag2str(u16 tag)
{
	switch (tag) {
	case EXT4_FC_TAG_LINK:
		return "TAG_ADD_ENTRY";
	case EXT4_FC_TAG_UNLINK:
		return "TAG_DEL_ENTRY";
	case EXT4_FC_TAG_ADD_RANGE:
		return "TAG_ADD_RANGE";
	case EXT4_FC_TAG_CREAT:
		return "TAG_CREAT_DENTRY";
	case EXT4_FC_TAG_DEL_RANGE:
		return "TAG_DEL_RANGE";
	case EXT4_FC_TAG_INODE:
		return "TAG_INODE";
	case EXT4_FC_TAG_PAD:
		return "TAG_PAD";
	case EXT4_FC_TAG_TAIL:
		return "TAG_TAIL";
	case EXT4_FC_TAG_HEAD:
		return "TAG_HEAD";
	default:
		return "TAG_ERROR";
	}
}

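/*
 * Final pass over every inode the replayed tags touched: mark the blocks
 * that are still referenced - both the mapped data blocks and the extent
 * tree index blocks leading to them - as in use in the block bitmaps.
 * Blocks that were freed during replay and never reused stay free.
 */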
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (blk >= state->fc_regions[i].pblk &&
		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

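/*
 * On-disk layout of a fast commit block, as the scan below expects it:
 * a sequence of TLV entries, each a struct ext4_fc_tl header (16-bit tag,
 * 16-bit value length) immediately followed by the value bytes. Roughly:
 *
 *   | HEAD tl | fc_head | tag tl | value | ... | TAIL tl | fc_tail |
 *
 * The first tag of a fast commit must be EXT4_FC_TAG_HEAD (carrying the
 * feature flags and the expected tid), and each EXT4_FC_TAG_TAIL carries
 * the tid plus the running checksum that validates everything seen since
 * the previous tail.
 */
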
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
					     ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
					     ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}

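/*
 * jbd2 drives recovery by invoking the j_fc_replay_callback installed
 * below once per fast commit block: first in the scan pass for every
 * block, then again in the replay pass once the scan has validated the
 * fast commit area.
 */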
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed
	 * even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

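/*
 * Human-readable descriptions of the fast commit ineligibility reasons.
 * The order must match the EXT4_FC_REASON_* numbering, since
 * ext4_fc_info_show() indexes this array by reason number.
 */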
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};

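/*
 * Report fast commit statistics through a seq_file; this backs the
 * fc_info file that ext4 exposes in procfs (the registration itself
 * lives in ext4's sysfs/procfs setup code, not in this file).
 */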
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(sbi->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

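/*
 * Allocate the slab cache used to queue directory entry updates
 * (struct ext4_fc_dentry_update); called once at module init time.
 */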
int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}