// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. (An illustrative walk of this TLV log is
 * sketched just after this comment.) For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; it is instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; see the following section
 *     for more details)
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
 *   fall back to a full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits that happen between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible(), and one fast commit after the call to
 *   ext4_fc_stop_ineligible(), fall back to full commits. It is important
 *   that one more fast commit after the stop call falls back to a full
 *   commit, so that the fast commit ineligible operation contained within
 *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is guaranteed
 *   to be followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * TODOs
 * -----
 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher level
 *    routines. This can be made more fine grained by combining with
 *    ext4_journal_start().
 *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible().
 *
 * 3) Handle more ineligible cases.
 */
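
/*
 * Illustrative sketch (for clarity only; not compiled as part of this file):
 * walking the TLV log described above means advancing a cursor by
 * sizeof(tl) + fc_len for every tag, which is effectively what the replay
 * scanner does. The names "start", "bsize" and handle_tag() are assumptions
 * made for this sketch:
 *
 *	u8 *cur = start, *end = start + bsize;
 *	struct ext4_fc_tl *tl;
 *
 *	while (cur < end) {
 *		tl = (struct ext4_fc_tl *)cur;
 *		handle_tag(le16_to_cpu(tl->fc_tag), cur + sizeof(*tl));
 *		cur += sizeof(*tl) + le16_to_cpu(tl->fc_len);
 *	}
 *
 * where handle_tag() is a hypothetical dispatcher; a valid EXT4_FC_TAG_TAIL
 * (CRC and TID checks pass) marks the end of one fast commit.
 */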

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
	ei->i_fc_committed_subtid = 0;
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits, if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}
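
/*
 * Illustrative usage (an assumed caller, not code from this file): update
 * paths bracket inode modifications with this pair so that a concurrent fast
 * commit can quiesce the inode first:
 *
 *	ext4_fc_start_update(inode);
 *	... modify the inode ...
 *	ext4_fc_stop_update(inode);
 */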

/*
 * Remove the inode from the fast commit list. If the inode is being
 * committed, we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the file system as fast commit ineligible. This means that the next
 * commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Start a fast commit ineligible update. Any commits that happen while
 * such an operation is in progress fall back to full commits.
 */
void ext4_fc_start_ineligible(struct super_block *sb, int reason)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
	atomic_inc(&sbi->s_fc_ineligible_updates);
}

/*
 * Stop a fast commit ineligible update. We set the EXT4_MF_FC_INELIGIBLE flag
 * here to ensure that after stopping the ineligible update, at least one full
 * commit takes place.
 */
void ext4_fc_stop_ineligible(struct super_block *sb)
{
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
}
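
/*
 * Illustrative usage (an assumed caller; EXT4_FC_REASON_XATTR is one of the
 * reason codes, used here only as an example): an unsupported multi-step
 * operation is bracketed so that commits racing with it, plus one commit
 * after the stop call, fall back to full jbd2 commits:
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	... perform the fast-commit-ineligible operation ...
 *	ext4_fc_stop_ineligible(sb);
 */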

static inline int ext4_fc_is_ineligible(struct super_block *sb)
{
	return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a
 * full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit
 * list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
						EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	struct inode *inode = d_inode(dentry);
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(inode, dentry, ret);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode,
			 ext4_lblk_t start, ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(inode, start, end, ret);
}
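
/*
 * Illustrative usage (an assumed caller): a write path that has just mapped
 * logical blocks [lblk, lblk + len - 1] of an inode under "handle" would
 * record the range for the next fast commit with:
 *
 *	ext4_fc_track_range(handle, inode, lblk, lblk + len - 1);
 */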

static void ext4_fc_submit_bh(struct super_block *sb)
{
	int write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
	if (test_opt(sb, BARRIER))
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE, write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from the current buffer if we have enough
		 * space for this request AND we have space to add a zero
		 * byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}

/* memcpy to fc reserved space and update CRC */
static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
				int len, u32 *crc)
{
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
	return memcpy(dst, src, len);
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing a tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has a length covering the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}

/*
 * Adds tag, length and value, and updates the CRC. Returns true if the TLV
 * was added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);

	return true;
}

/* Same as above, but adds a dentry TLV. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
					int parent_ino, int ino, int dlen,
					const unsigned char *dname,
					u32 *crc)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
					crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
	fcd.fc_ino = cpu_to_le32(ino);
	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
	dst += sizeof(tl);
	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
	dst += sizeof(fcd);
	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
	dst += dlen;

	return true;
}

/*
 * Writes the inode to the fast commit space under the EXT4_FC_TAG_INODE tag.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
					sb, fc_dentry->fcd_op,
					fc_dentry->fcd_parent, fc_dentry->fcd_ino,
					fc_dentry->fcd_name.len,
					fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case we don't need to record its create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct list_head *pos;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc))
			goto out;
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
		EXT4_I(inode)->i_fc_committed_subtid =
			atomic_read(&sbi->s_fc_subtid);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time.
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
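
/*
 * Illustrative call site (an assumption; the real caller lives outside this
 * file, in the fsync path): fsync kicks off a fast commit for the inode's
 * last sync transaction roughly as:
 *
 *	ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
 *			     EXT4_I(inode)->i_sync_tid);
 */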

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_dentry_update *fc_dentry;
	struct list_head *pos, *n;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
	sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}
Harshad Shirwadkar6866d7b2020-10-15 13:37:55 -07001207
Harshad Shirwadkar8016e292020-10-15 13:37:59 -07001208/* Ext4 Replay Path Routines */
1209
1210/* Get length of a particular tlv */
1211static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1212{
1213 return le16_to_cpu(tl->fc_len);
1214}
1215
1216/* Get a pointer to "value" of a tlv */
1217static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1218{
1219 return (u8 *)tl + sizeof(*tl);
1220}
1221
1222/* Helper struct for dentry replay routines */
1223struct dentry_info_args {
1224 int parent_ino, dname_len, ino, inode_len;
1225 char *dname;
1226};
1227
1228static inline void tl_to_darg(struct dentry_info_args *darg,
1229 struct ext4_fc_tl *tl)
1230{
1231 struct ext4_fc_dentry_info *fcd;
1232
1233 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1234
1235 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1236 darg->ino = le32_to_cpu(fcd->fc_ino);
1237 darg->dname = fcd->fc_dname;
1238 darg->dname_len = ext4_fc_tag_len(tl) -
1239 sizeof(struct ext4_fc_dentry_info);
1240}
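
/*
 * Illustrative layout (for clarity; follows from struct ext4_fc_dentry_info):
 * a dentry TLV value is the fixed header followed by the name bytes, so
 * dname_len above is simply fc_len minus the header size:
 *
 *	| fc_parent_ino (4) | fc_ino (4) | dname (fc_len - 8 bytes) |
 */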

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_modified_inodes = krealloc(
					state->fc_modified_inodes, sizeof(int) *
					state->fc_modified_inodes_size,
					GFP_KERNEL);
		if (!state->fc_modified_inodes)
			return -ENOMEM;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
{
	struct ext4_fc_inode *fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
	struct ext4_extent_header *eh;

	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);

	ino = le32_to_cpu(fc_inode->fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR_OR_NULL(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ext4_fc_record_modified_inode(sb, ino);

	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
		inode_len - offsetof(struct ext4_inode, i_generation));
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. This means that the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that
		 * the dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area.
 * Our simple replay phase allocator excludes these regions from allocation.
 */
static int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = krealloc(
					state->fc_regions,
					state->fc_regions_size *
					sizeof(struct ext4_fc_alloc_region),
					GFP_KERNEL);
		if (!state->fc_regions)
			return -ENOMEM;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
			map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1699
1700/* Replay DEL_RANGE tag */
1701static int
1702ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1703{
1704 struct inode *inode;
1705 struct ext4_fc_del_range *lrange;
1706 struct ext4_map_blocks map;
1707 ext4_lblk_t cur, remaining;
1708 int ret;
1709
1710 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1711 cur = le32_to_cpu(lrange->fc_lblk);
1712 remaining = le32_to_cpu(lrange->fc_len);
1713
1714 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1715 le32_to_cpu(lrange->fc_ino), cur, remaining);
1716
1717 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1718 if (IS_ERR_OR_NULL(inode)) {
1719 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1720 return 0;
1721 }
1722
1723 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1724
1725 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1726 inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1727 le32_to_cpu(lrange->fc_len));
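	/*
	 * First clear the block bitmap bits for every chunk of the range
	 * that is currently mapped; the punch hole below then removes the
	 * extents themselves from the tree.
	 */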
1728 while (remaining > 0) {
1729 map.m_lblk = cur;
1730 map.m_len = remaining;
1731
1732 ret = ext4_map_blocks(NULL, inode, &map, 0);
1733 if (ret < 0) {
1734 iput(inode);
1735 return 0;
1736 }
1737 if (ret > 0) {
1738 remaining -= ret;
1739 cur += ret;
1740 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1741 } else {
1742 remaining -= map.m_len;
1743 cur += map.m_len;
1744 }
1745 }
1746
1747	ret = ext4_punch_hole(inode,
1748		(loff_t)le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1749		(loff_t)le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
1750 if (ret)
1751 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1752 ext4_ext_replay_shrink_inode(inode,
1753 i_size_read(inode) >> sb->s_blocksize_bits);
1754 ext4_mark_inode_dirty(NULL, inode);
1755 iput(inode);
1756
1757 return 0;
1758}
1759
1760static inline const char *tag2str(u16 tag)
1761{
1762 switch (tag) {
1763 case EXT4_FC_TAG_LINK:
1764 return "TAG_ADD_ENTRY";
1765 case EXT4_FC_TAG_UNLINK:
1766 return "TAG_DEL_ENTRY";
1767 case EXT4_FC_TAG_ADD_RANGE:
1768 return "TAG_ADD_RANGE";
1769 case EXT4_FC_TAG_CREAT:
1770 return "TAG_CREAT_DENTRY";
1771 case EXT4_FC_TAG_DEL_RANGE:
1772 return "TAG_DEL_RANGE";
1773 case EXT4_FC_TAG_INODE:
1774 return "TAG_INODE";
1775 case EXT4_FC_TAG_PAD:
1776 return "TAG_PAD";
1777 case EXT4_FC_TAG_TAIL:
1778 return "TAG_TAIL";
1779 case EXT4_FC_TAG_HEAD:
1780 return "TAG_HEAD";
1781 default:
1782 return "TAG_ERROR";
1783 }
1784}
1785
1786static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1787{
1788 struct ext4_fc_replay_state *state;
1789 struct inode *inode;
1790 struct ext4_ext_path *path = NULL;
1791 struct ext4_map_blocks map;
1792 int i, ret, j;
1793 ext4_lblk_t cur, end;
1794
1795 state = &EXT4_SB(sb)->s_fc_replay_state;
1796 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1797 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1798 EXT4_IGET_NORMAL);
1799 if (IS_ERR_OR_NULL(inode)) {
1800 jbd_debug(1, "Inode %d not found.",
1801 state->fc_modified_inodes[i]);
1802 continue;
1803 }
1804 cur = 0;
1805 end = EXT_MAX_BLOCKS;
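		/*
		 * Re-mark as in-use every surviving extent of this modified
		 * inode, along with the extent tree index blocks on the path
		 * that leads to each extent.
		 */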
1806 while (cur < end) {
1807 map.m_lblk = cur;
1808 map.m_len = end - cur;
1809
1810 ret = ext4_map_blocks(NULL, inode, &map, 0);
1811 if (ret < 0)
1812 break;
1813
1814 if (ret > 0) {
1815 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1816 if (!IS_ERR_OR_NULL(path)) {
1817 for (j = 0; j < path->p_depth; j++)
1818 ext4_mb_mark_bb(inode->i_sb,
1819 path[j].p_block, 1, 1);
1820 ext4_ext_drop_refs(path);
1821 kfree(path);
1822 }
1823 cur += ret;
1824 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1825 map.m_len, 1);
1826 } else {
1827 cur = cur + (map.m_len ? map.m_len : 1);
1828 }
1829 }
1830 iput(inode);
1831 }
1832}
1833
1834/*
1835 * Check if a block is in the excluded regions for block allocation. The
1836 * simple allocator that runs during the replay phase calls this function
1837 * to check whether it is okay to use a block.
1838 */
1839bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1840{
1841 int i;
1842 struct ext4_fc_replay_state *state;
1843
1844 state = &EXT4_SB(sb)->s_fc_replay_state;
1845 for (i = 0; i < state->fc_regions_valid; i++) {
1846 if (state->fc_regions[i].ino == 0 ||
1847 state->fc_regions[i].len == 0)
1848 continue;
1849 if (blk >= state->fc_regions[i].pblk &&
1850 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1851 return true;
1852 }
1853 return false;
1854}
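
/*
 * Usage sketch (illustrative only, not part of the ext4 replay path): a
 * replay-phase allocator could linearly probe for a block that lies
 * outside every excluded region. example_replay_pick_block() is a
 * hypothetical helper that only demonstrates the calling pattern.
 */
static ext4_fsblk_t __maybe_unused
example_replay_pick_block(struct super_block *sb, ext4_fsblk_t start,
			  ext4_fsblk_t end)
{
	ext4_fsblk_t blk;

	/* Probe candidates, skipping anything in an excluded region. */
	for (blk = start; blk < end; blk++)
		if (!ext4_fc_replay_check_excluded(sb, blk))
			return blk;

	return 0;	/* no usable block in [start, end) */
}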
1855
1856/* Cleanup function called after replay */
1857void ext4_fc_replay_cleanup(struct super_block *sb)
1858{
1859 struct ext4_sb_info *sbi = EXT4_SB(sb);
1860
1861 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1862 kfree(sbi->s_fc_replay_state.fc_regions);
1863 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1864}
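
/*
 * Both the scan and the replay handlers below consume the fast commit
 * area as a stream of tag-length-value (TLV) records via
 * fc_for_each_tl(). Assuming struct ext4_fc_tl is a __le16 tag followed
 * by a __le16 length (see the fast commit header; the exact macro may
 * differ), the walk is roughly:
 *
 *	for (tl = (struct ext4_fc_tl *)start;
 *	     (__u8 *)tl < end;
 *	     tl = (struct ext4_fc_tl *)((__u8 *)tl + sizeof(*tl) +
 *					le16_to_cpu(tl->fc_len)))
 *		... handle tl; its value starts at ext4_fc_tag_val(tl) ...
 */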
1865
1866/*
1867 * Recovery Scan phase handler
1868 *
1869 * This function is called during the scan phase and is responsible
1870 * for doing following things:
1871 * - Make sure the fast commit area has valid tags for replay
1872 * - Count number of tags that need to be replayed by the replay handler
1873 * - Verify CRC
1874 * - Create a list of excluded blocks for allocation during replay phase
1875 *
1876 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1877 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1878 * to indicate that scan has finished and JBD2 can now start replay phase.
1879 * It returns a negative error to indicate that there was an error. At the end
1880 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1881 * to indicate the number of tags that need to replayed during the replay phase.
1882 */
1883static int ext4_fc_replay_scan(journal_t *journal,
1884 struct buffer_head *bh, int off,
1885 tid_t expected_tid)
1886{
1887 struct super_block *sb = journal->j_private;
1888 struct ext4_sb_info *sbi = EXT4_SB(sb);
1889 struct ext4_fc_replay_state *state;
1890 int ret = JBD2_FC_REPLAY_CONTINUE;
1891 struct ext4_fc_add_range *ext;
1892 struct ext4_fc_tl *tl;
1893 struct ext4_fc_tail *tail;
1894 __u8 *start, *end;
1895 struct ext4_fc_head *head;
1896 struct ext4_extent *ex;
1897
1898 state = &sbi->s_fc_replay_state;
1899
1900	start = (__u8 *)bh->b_data;
1901 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1902
1903 if (state->fc_replay_expected_off == 0) {
1904 state->fc_cur_tag = 0;
1905 state->fc_replay_num_tags = 0;
1906 state->fc_crc = 0;
1907 state->fc_regions = NULL;
1908 state->fc_regions_valid = state->fc_regions_used =
1909 state->fc_regions_size = 0;
1910 /* Check if we can stop early */
1911 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1912 != EXT4_FC_TAG_HEAD)
1913 return 0;
1914 }
1915
1916 if (off != state->fc_replay_expected_off) {
1917 ret = -EFSCORRUPTED;
1918 goto out_err;
1919 }
1920
1921 state->fc_replay_expected_off++;
1922 fc_for_each_tl(start, end, tl) {
1923 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1924 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1925 switch (le16_to_cpu(tl->fc_tag)) {
1926 case EXT4_FC_TAG_ADD_RANGE:
1927 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1928 ex = (struct ext4_extent *)&ext->fc_ex;
1929 ret = ext4_fc_record_regions(sb,
1930 le32_to_cpu(ext->fc_ino),
1931 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1932 ext4_ext_get_actual_len(ex));
1933 if (ret < 0)
1934 break;
1935 ret = JBD2_FC_REPLAY_CONTINUE;
1936 fallthrough;
1937 case EXT4_FC_TAG_DEL_RANGE:
1938 case EXT4_FC_TAG_LINK:
1939 case EXT4_FC_TAG_UNLINK:
1940 case EXT4_FC_TAG_CREAT:
1941 case EXT4_FC_TAG_INODE:
1942 case EXT4_FC_TAG_PAD:
1943 state->fc_cur_tag++;
1944 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1945 sizeof(*tl) + ext4_fc_tag_len(tl));
1946 break;
1947 case EXT4_FC_TAG_TAIL:
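			/*
			 * The tail carries the commit's tid and a CRC over
			 * everything seen so far; fold in the tail itself up
			 * to (but excluding) its fc_crc field before
			 * comparing.
			 */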
1948 state->fc_cur_tag++;
1949 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1950 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1951 sizeof(*tl) +
1952 offsetof(struct ext4_fc_tail,
1953 fc_crc));
1954 if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1955 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1956 state->fc_replay_num_tags = state->fc_cur_tag;
1957 state->fc_regions_valid =
1958 state->fc_regions_used;
1959 } else {
1960 ret = state->fc_replay_num_tags ?
1961 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1962 }
1963 state->fc_crc = 0;
1964 break;
1965 case EXT4_FC_TAG_HEAD:
1966 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1967 if (le32_to_cpu(head->fc_features) &
1968 ~EXT4_FC_SUPPORTED_FEATURES) {
1969 ret = -EOPNOTSUPP;
1970 break;
1971 }
1972 if (le32_to_cpu(head->fc_tid) != expected_tid) {
1973 ret = JBD2_FC_REPLAY_STOP;
1974 break;
1975 }
1976 state->fc_cur_tag++;
1977 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1978 sizeof(*tl) + ext4_fc_tag_len(tl));
1979 break;
1980 default:
1981 ret = state->fc_replay_num_tags ?
1982 JBD2_FC_REPLAY_STOP : -ECANCELED;
1983 }
1984 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1985 break;
1986 }
1987
1988out_err:
1989 trace_ext4_fc_replay_scan(sb, ret, off);
1990 return ret;
1991}
1992
1993/*
1994 * Main recovery path entry point.
1995 * The meaning of the return codes is the same as described above.
1996 */
1997static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
1998 enum passtype pass, int off, tid_t expected_tid)
1999{
2000	struct super_block *sb = journal->j_private;
2001 struct ext4_sb_info *sbi = EXT4_SB(sb);
2002 struct ext4_fc_tl *tl;
2003 __u8 *start, *end;
2004 int ret = JBD2_FC_REPLAY_CONTINUE;
2005 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2006 struct ext4_fc_tail *tail;
2007
2008 if (pass == PASS_SCAN) {
2009 state->fc_current_pass = PASS_SCAN;
2010 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2011 }
2012
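	/* First block of a new pass: note it and flag the fs as replaying. */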
2013 if (state->fc_current_pass != pass) {
2014 state->fc_current_pass = pass;
2015 sbi->s_mount_state |= EXT4_FC_REPLAY;
2016 }
2017 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2018 jbd_debug(1, "Replay stops\n");
2019 ext4_fc_set_bitmaps_and_counters(sb);
2020 return 0;
2021 }
2022
2023#ifdef CONFIG_EXT4_DEBUG
2024 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2025 pr_warn("Dropping fc block %d because max_replay set\n", off);
2026 return JBD2_FC_REPLAY_STOP;
2027 }
2028#endif
2029
2030	start = (__u8 *)bh->b_data;
2031 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2032
2033 fc_for_each_tl(start, end, tl) {
2034 if (state->fc_replay_num_tags == 0) {
2035 ret = JBD2_FC_REPLAY_STOP;
2036 ext4_fc_set_bitmaps_and_counters(sb);
2037 break;
2038 }
2039 jbd_debug(3, "Replay phase, tag:%s\n",
2040 tag2str(le16_to_cpu(tl->fc_tag)));
2041 state->fc_replay_num_tags--;
2042 switch (le16_to_cpu(tl->fc_tag)) {
2043 case EXT4_FC_TAG_LINK:
2044 ret = ext4_fc_replay_link(sb, tl);
2045 break;
2046 case EXT4_FC_TAG_UNLINK:
2047 ret = ext4_fc_replay_unlink(sb, tl);
2048 break;
2049 case EXT4_FC_TAG_ADD_RANGE:
2050 ret = ext4_fc_replay_add_range(sb, tl);
2051 break;
2052 case EXT4_FC_TAG_CREAT:
2053 ret = ext4_fc_replay_create(sb, tl);
2054 break;
2055 case EXT4_FC_TAG_DEL_RANGE:
2056 ret = ext4_fc_replay_del_range(sb, tl);
2057 break;
2058 case EXT4_FC_TAG_INODE:
2059 ret = ext4_fc_replay_inode(sb, tl);
2060 break;
2061 case EXT4_FC_TAG_PAD:
2062 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2063 ext4_fc_tag_len(tl), 0);
2064 break;
2065 case EXT4_FC_TAG_TAIL:
2066 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2067 ext4_fc_tag_len(tl), 0);
2068 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2069 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2070 break;
2071 case EXT4_FC_TAG_HEAD:
2072 break;
2073 default:
2074 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2075 ext4_fc_tag_len(tl), 0);
2076 ret = -ECANCELED;
2077 break;
2078 }
2079 if (ret < 0)
2080 break;
2081 ret = JBD2_FC_REPLAY_CONTINUE;
2082 }
2083 return ret;
2084}
2085
2086void ext4_fc_init(struct super_block *sb, journal_t *journal)
2087{
2088	/*
2089	 * We set the replay callback even if fast commit is disabled because
2090	 * we could still have fast commit blocks that need to be replayed
2091	 * even if fast commit has now been turned off.
2092	 */
2093	journal->j_fc_replay_callback = ext4_fc_replay;
2094	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2095		return;
2096	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2097}
2098
2099const char *fc_ineligible_reasons[] = {
2100 "Extended attributes changed",
2101 "Cross rename",
2102 "Journal flag changed",
2103 "Insufficient memory",
2104 "Swap boot",
2105 "Resize",
2106 "Dir renamed",
2107 "Falloc range op",
2108 "FC Commit Failed"
2109};
2110
2111int ext4_fc_info_show(struct seq_file *seq, void *v)
2112{
2113 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2114 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2115 int i;
2116
2117 if (v != SEQ_START_TOKEN)
2118 return 0;
2119
2120 seq_printf(seq,
2121 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2122 stats->fc_num_commits, stats->fc_ineligible_commits,
2123 stats->fc_numblks,
2124 div_u64(sbi->s_fc_avg_commit_time, 1000));
2125 seq_puts(seq, "Ineligible reasons:\n");
2126 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2127 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2128 stats->fc_ineligible_reason_count[i]);
2129
2130 return 0;
2131}
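
/*
 * Illustrative output (values invented) as rendered by the seq_printf()
 * calls above, typically exposed as the fc_info file under
 * /proc/fs/ext4/<dev>/:
 *
 *	fc stats:
 *	128 commits
 *	3 ineligible
 *	412 numblks
 *	1874us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	2
 *	"Cross rename":	0
 *	...
 */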
2132
2133int __init ext4_fc_init_dentry_cache(void)
2134{
2135 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2136 SLAB_RECLAIM_ACCOUNT);
2137
2138 if (ext4_fc_dentry_cachep == NULL)
2139 return -ENOMEM;
2140
2141 return 0;
2142}