Blame - fs/ext4/fast_commit.c - linux-5.10

blob: 447c8d93f48081c11ed85e6076df2617af4f20c0 [file] [log] [blame]

Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	/*
				4	* fs/ext4/fast_commit.c
				5	*
				6	* Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
				7	*
				8	* Ext4 fast commits routines.
				9	*/
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	10	#include "ext4.h"
Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	11	#include "ext4_jbd2.h"
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	12	#include "ext4_extents.h"
				13	#include "mballoc.h"
				14
				15	/*
				16	* Ext4 Fast Commits
				17	* -----------------
				18	*
				19	* Ext4 fast commits implement fine grained journalling for Ext4.
				20	*
				21	* Fast commits are organized as a log of tag-length-value (TLV) structs. (See
				22	* struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
				23	* TLV during the recovery phase. For the scenarios for which we currently
				24	* don't have replay code, fast commit falls back to full commits.
				25	* Fast commits record delta in one of the following three categories.
				26	*
				27	* (A) Directory entry updates:
				28	*
				29	* - EXT4_FC_TAG_UNLINK - records directory entry unlink
				30	* - EXT4_FC_TAG_LINK - records directory entry link
				31	* - EXT4_FC_TAG_CREAT - records inode and directory entry creation
				32	*
				33	* (B) File specific data range updates:
				34	*
				35	* - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
				36	* - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
				37	*
				38	* (C) Inode metadata (mtime / ctime etc):
				39	*
				40	* - EXT4_FC_TAG_INODE - record the inode that should be replayed
				41	* during recovery. Note that iblocks field is
				42	* not replayed and instead derived during
				43	* replay.
				44	* Commit Operation
				45	* ----------------
				46	* With fast commits, we maintain all the directory entry operations in the
				47	* order in which they are issued in an in-memory queue. This queue is flushed
				48	* to disk during the commit operation. We also maintain a list of inodes
				49	* that need to be committed during a fast commit in another in memory queue of
				50	* inodes. During the commit operation, we commit in the following order:
				51	*
				52	* [1] Lock inodes for any further data updates by setting COMMITTING state
				53	* [2] Submit data buffers of all the inodes
				54	* [3] Wait for [2] to complete
				55	* [4] Commit all the directory entry updates in the fast commit space
				56	* [5] Commit all the changed inode structures
				57	* [6] Write tail tag (this tag ensures the atomicity, please read the following
				58	* section for more details).
				59	* [7] Wait for [4], [5] and [6] to complete.
				60	*
				61	* All the inode updates must call ext4_fc_start_update() before starting an
				62	* update. If such an ongoing update is present, fast commit waits for it to
				63	* complete. The completion of such an update is marked by
				64	* ext4_fc_stop_update().
				65	*
				66	* Fast Commit Ineligibility
				67	* -------------------------
				68	* Not all operations are supported by fast commits today (e.g extended
				69	* attributes). Fast commit ineligibility is marked by calling one of the
				70	* two following functions:
				71	*
				72	* - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
				73	* back to full commit. This is useful in case of transient errors.
				74	*
				75	* - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
				76	* the fast commits happening between ext4_fc_start_ineligible() and
				77	* ext4_fc_stop_ineligible() and one fast commit after the call to
				78	* ext4_fc_stop_ineligible() to fall back to full commits. It is important to
				79	* make one more fast commit to fall back to full commit after stop call so
				80	* that it is guaranteed that the fast commit ineligible operation contained
				81	* within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
				82	* followed by at least 1 full commit.
				83	*
				84	* Atomicity of commits
				85	* --------------------
				86	* In order to guarantee atomicity during the commit operation, fast commit
				87	* uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
				88	* tag contains CRC of the contents and TID of the transaction after which
				89	* this fast commit should be applied. Recovery code replays fast commit
				90	* logs only if there's at least 1 valid tail present. For every fast commit
				91	* operation, there is 1 tail. This means, we may end up with multiple tails
				92	* in the fast commit space. Here's an example:
				93	*
				94	* - Create a new file A and remove existing file B
				95	* - fsync()
				96	* - Append contents to file A
				97	* - Truncate file A
				98	* - fsync()
				99	*
				100	* The fast commit space at the end of above operations would look like this:
				101	* [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
				102	* |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
				103	*
				104	* Replay code should thus check for all the valid tails in the FC area.
				105	*
				106	* TODOs
				107	* -----
				108	* 1) Make fast commit atomic updates more fine grained. Today, a fast commit
				109	* eligible update must be protected within ext4_fc_start_update() and
				110	* ext4_fc_stop_update(). These routines are called at much higher
				111	* routines. This can be made more fine grained by combining with
				112	* ext4_journal_start().
				113	*
				114	* 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
				115	*
				116	* 3) Handle more ineligible cases.
				117	*/
				118
				119	#include <trace/events/ext4.h>
				120	static struct kmem_cache *ext4_fc_dentry_cachep;
				121
				122	static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				123	{
				124	BUFFER_TRACE(bh, "");
				125	if (uptodate) {
				126	ext4_debug("%s: Block %lld up-to-date",
				127	__func__, bh->b_blocknr);
				128	set_buffer_uptodate(bh);
				129	} else {
				130	ext4_debug("%s: Block %lld not up-to-date",
				131	__func__, bh->b_blocknr);
				132	clear_buffer_uptodate(bh);
				133	}
				134
				135	unlock_buffer(bh);
				136	}
				137
				138	static inline void ext4_fc_reset_inode(struct inode *inode)
				139	{
				140	struct ext4_inode_info *ei = EXT4_I(inode);
				141
				142	ei->i_fc_lblk_start = 0;
				143	ei->i_fc_lblk_len = 0;
				144	}
				145
				146	void ext4_fc_init_inode(struct inode *inode)
				147	{
				148	struct ext4_inode_info *ei = EXT4_I(inode);
				149
				150	ext4_fc_reset_inode(inode);
				151	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
				152	INIT_LIST_HEAD(&ei->i_fc_list);
				153	init_waitqueue_head(&ei->i_fc_wait);
				154	atomic_set(&ei->i_fc_updates, 0);
				155	ei->i_fc_committed_subtid = 0;
				156	}
				157
				158	/*
				159	* Inform Ext4's fast commit about the start of an inode update
				160	*
				161	* This function is called by the high level VFS callbacks before
				162	* performing any inode update. This function blocks if there's an ongoing
				163	* fast commit on the inode in question.
				164	*/
				165	void ext4_fc_start_update(struct inode *inode)
				166	{
				167	struct ext4_inode_info *ei = EXT4_I(inode);
				168
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	169	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) \|\|
				170	(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	171	return;
				172
				173	restart:
				174	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				175	if (list_empty(&ei->i_fc_list))
				176	goto out;
				177
				178	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
				179	wait_queue_head_t *wq;
				180	#if (BITS_PER_LONG < 64)
				181	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				182	EXT4_STATE_FC_COMMITTING);
				183	wq = bit_waitqueue(&ei->i_state_flags,
				184	EXT4_STATE_FC_COMMITTING);
				185	#else
				186	DEFINE_WAIT_BIT(wait, &ei->i_flags,
				187	EXT4_STATE_FC_COMMITTING);
				188	wq = bit_waitqueue(&ei->i_flags,
				189	EXT4_STATE_FC_COMMITTING);
				190	#endif
				191	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				192	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				193	schedule();
				194	finish_wait(wq, &wait.wq_entry);
				195	goto restart;
				196	}
				197	out:
				198	atomic_inc(&ei->i_fc_updates);
				199	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				200	}
				201
				202	/*
				203	* Stop inode update and wake up waiting fast commits if any.
				204	*/
				205	void ext4_fc_stop_update(struct inode *inode)
				206	{
				207	struct ext4_inode_info *ei = EXT4_I(inode);
				208
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	209	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) \|\|
				210	(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	211	return;
				212
				213	if (atomic_dec_and_test(&ei->i_fc_updates))
				214	wake_up_all(&ei->i_fc_wait);
				215	}
				216
				217	/*
				218	* Remove inode from fast commit list. If the inode is being committed
				219	* we wait until inode commit is done.
				220	*/
				221	void ext4_fc_del(struct inode *inode)
				222	{
				223	struct ext4_inode_info *ei = EXT4_I(inode);
				224
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	225	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) \|\|
				226	(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	227	return;
				228
				229	restart:
				230	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				231	if (list_empty(&ei->i_fc_list)) {
				232	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				233	return;
				234	}
				235
				236	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
				237	wait_queue_head_t *wq;
				238	#if (BITS_PER_LONG < 64)
				239	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				240	EXT4_STATE_FC_COMMITTING);
				241	wq = bit_waitqueue(&ei->i_state_flags,
				242	EXT4_STATE_FC_COMMITTING);
				243	#else
				244	DEFINE_WAIT_BIT(wait, &ei->i_flags,
				245	EXT4_STATE_FC_COMMITTING);
				246	wq = bit_waitqueue(&ei->i_flags,
				247	EXT4_STATE_FC_COMMITTING);
				248	#endif
				249	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				250	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				251	schedule();
				252	finish_wait(wq, &wait.wq_entry);
				253	goto restart;
				254	}
				255	if (!list_empty(&ei->i_fc_list))
				256	list_del_init(&ei->i_fc_list);
				257	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
				258	}
				259
				260	/*
				261	* Mark file system as fast commit ineligible. This means that next commit
				262	* operation would result in a full jbd2 commit.
				263	*/
				264	void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
				265	{
				266	struct ext4_sb_info *sbi = EXT4_SB(sb);
				267
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	268	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) \|\|
				269	(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
				270	return;
				271
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	272	sbi->s_mount_state \|= EXT4_FC_INELIGIBLE;
				273	WARN_ON(reason >= EXT4_FC_REASON_MAX);
				274	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
				275	}
				276
				277	/*
				278	* Start a fast commit ineligible update. Any commits that happen while
				279	* such an operation is in progress fall back to full commits.
				280	*/
				281	void ext4_fc_start_ineligible(struct super_block *sb, int reason)
				282	{
				283	struct ext4_sb_info *sbi = EXT4_SB(sb);
				284
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	285	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) \|\|
				286	(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
				287	return;
				288
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	289	WARN_ON(reason >= EXT4_FC_REASON_MAX);
				290	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
				291	atomic_inc(&sbi->s_fc_ineligible_updates);
				292	}
				293
				294	/*
				295	* Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here
				296	* to ensure that after stopping the ineligible update, at least one full
				297	* commit takes place.
				298	*/
				299	void ext4_fc_stop_ineligible(struct super_block *sb)
				300	{
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	301	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) \|\|
				302	(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
				303	return;
				304
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	305	EXT4_SB(sb)->s_mount_state \|= EXT4_FC_INELIGIBLE;
				306	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
				307	}
				308
				309	static inline int ext4_fc_is_ineligible(struct super_block *sb)
				310	{
				311	return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) \|\|
				312	atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
				313	}
				314
				315	/*
				316	* Generic fast commit tracking function. If this is the first time we are
				317	* called after a full commit, we initialize fast commit fields and then call
				318	* __fc_track_fn() with update = 0. If we have already been called after a full
				319	* commit, we pass update = 1. Based on that, the track function can determine
				320	* if it needs to track a field for the first time or if it needs to just
				321	* update the previously tracked value.
				322	*
				323	* If enqueue is set, this function enqueues the inode in fast commit list.
				324	*/
				325	static int ext4_fc_track_template(
				326	struct inode inode, int (__fc_track_fn)(struct inode , void , bool),
				327	void *args, int enqueue)
				328	{
				329	tid_t running_txn_tid;
				330	bool update = false;
				331	struct ext4_inode_info *ei = EXT4_I(inode);
				332	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
				333	int ret;
				334
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	335	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) \|\|
				336	(sbi->s_mount_state & EXT4_FC_REPLAY))
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	337	return -EOPNOTSUPP;
				338
				339	if (ext4_fc_is_ineligible(inode->i_sb))
				340	return -EINVAL;
				341
				342	running_txn_tid = sbi->s_journal ?
				343	sbi->s_journal->j_commit_sequence + 1 : 0;
				344
				345	mutex_lock(&ei->i_fc_lock);
				346	if (running_txn_tid == ei->i_sync_tid) {
				347	update = true;
				348	} else {
				349	ext4_fc_reset_inode(inode);
				350	ei->i_sync_tid = running_txn_tid;
				351	}
				352	ret = __fc_track_fn(inode, args, update);
				353	mutex_unlock(&ei->i_fc_lock);
				354
				355	if (!enqueue)
				356	return ret;
				357
				358	spin_lock(&sbi->s_fc_lock);
				359	if (list_empty(&EXT4_I(inode)->i_fc_list))
				360	list_add_tail(&EXT4_I(inode)->i_fc_list,
				361	(sbi->s_mount_state & EXT4_FC_COMMITTING) ?
				362	&sbi->s_fc_q[FC_Q_STAGING] :
				363	&sbi->s_fc_q[FC_Q_MAIN]);
				364	spin_unlock(&sbi->s_fc_lock);
				365
				366	return ret;
				367	}
				368
				369	struct __track_dentry_update_args {
				370	struct dentry *dentry;
				371	int op;
				372	};
				373
				374	/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
				375	static int __track_dentry_update(struct inode inode, void arg, bool update)
				376	{
				377	struct ext4_fc_dentry_update *node;
				378	struct ext4_inode_info *ei = EXT4_I(inode);
				379	struct __track_dentry_update_args *dentry_update =
				380	(struct __track_dentry_update_args *)arg;
				381	struct dentry *dentry = dentry_update->dentry;
				382	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
				383
				384	mutex_unlock(&ei->i_fc_lock);
				385	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
				386	if (!node) {
				387	ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MEM);
				388	mutex_lock(&ei->i_fc_lock);
				389	return -ENOMEM;
				390	}
				391
				392	node->fcd_op = dentry_update->op;
				393	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
				394	node->fcd_ino = inode->i_ino;
				395	if (dentry->d_name.len > DNAME_INLINE_LEN) {
				396	node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
				397	if (!node->fcd_name.name) {
				398	kmem_cache_free(ext4_fc_dentry_cachep, node);
				399	ext4_fc_mark_ineligible(inode->i_sb,
				400	EXT4_FC_REASON_MEM);
				401	mutex_lock(&ei->i_fc_lock);
				402	return -ENOMEM;
				403	}
				404	memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
				405	dentry->d_name.len);
				406	} else {
				407	memcpy(node->fcd_iname, dentry->d_name.name,
				408	dentry->d_name.len);
				409	node->fcd_name.name = node->fcd_iname;
				410	}
				411	node->fcd_name.len = dentry->d_name.len;
				412
				413	spin_lock(&sbi->s_fc_lock);
				414	if (sbi->s_mount_state & EXT4_FC_COMMITTING)
				415	list_add_tail(&node->fcd_list,
				416	&sbi->s_fc_dentry_q[FC_Q_STAGING]);
				417	else
				418	list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
				419	spin_unlock(&sbi->s_fc_lock);
				420	mutex_lock(&ei->i_fc_lock);
				421
				422	return 0;
				423	}
				424
				425	void ext4_fc_track_unlink(struct inode inode, struct dentry dentry)
				426	{
				427	struct __track_dentry_update_args args;
				428	int ret;
				429
				430	args.dentry = dentry;
				431	args.op = EXT4_FC_TAG_UNLINK;
				432
				433	ret = ext4_fc_track_template(inode, __track_dentry_update,
				434	(void *)&args, 0);
				435	trace_ext4_fc_track_unlink(inode, dentry, ret);
				436	}
				437
				438	void ext4_fc_track_link(struct inode inode, struct dentry dentry)
				439	{
				440	struct __track_dentry_update_args args;
				441	int ret;
				442
				443	args.dentry = dentry;
				444	args.op = EXT4_FC_TAG_LINK;
				445
				446	ret = ext4_fc_track_template(inode, __track_dentry_update,
				447	(void *)&args, 0);
				448	trace_ext4_fc_track_link(inode, dentry, ret);
				449	}
				450
				451	void ext4_fc_track_create(struct inode inode, struct dentry dentry)
				452	{
				453	struct __track_dentry_update_args args;
				454	int ret;
				455
				456	args.dentry = dentry;
				457	args.op = EXT4_FC_TAG_CREAT;
				458
				459	ret = ext4_fc_track_template(inode, __track_dentry_update,
				460	(void *)&args, 0);
				461	trace_ext4_fc_track_create(inode, dentry, ret);
				462	}
				463
				464	/* __track_fn for inode tracking */
				465	static int __track_inode(struct inode inode, void arg, bool update)
				466	{
				467	if (update)
				468	return -EEXIST;
				469
				470	EXT4_I(inode)->i_fc_lblk_len = 0;
				471
				472	return 0;
				473	}
				474
				475	void ext4_fc_track_inode(struct inode *inode)
				476	{
				477	int ret;
				478
				479	if (S_ISDIR(inode->i_mode))
				480	return;
				481
				482	ret = ext4_fc_track_template(inode, __track_inode, NULL, 1);
				483	trace_ext4_fc_track_inode(inode, ret);
				484	}
				485
				486	struct __track_range_args {
				487	ext4_lblk_t start, end;
				488	};
				489
				490	/* __track_fn for tracking data updates */
				491	static int __track_range(struct inode inode, void arg, bool update)
				492	{
				493	struct ext4_inode_info *ei = EXT4_I(inode);
				494	ext4_lblk_t oldstart;
				495	struct __track_range_args *__arg =
				496	(struct __track_range_args *)arg;
				497
				498	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
				499	ext4_debug("Special inode %ld being modified\n", inode->i_ino);
				500	return -ECANCELED;
				501	}
				502
				503	oldstart = ei->i_fc_lblk_start;
				504
				505	if (update && ei->i_fc_lblk_len > 0) {
				506	ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
				507	ei->i_fc_lblk_len =
				508	max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				509	ei->i_fc_lblk_start + 1;
				510	} else {
				511	ei->i_fc_lblk_start = __arg->start;
				512	ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
				513	}
				514
				515	return 0;
				516	}
				517
				518	void ext4_fc_track_range(struct inode *inode, ext4_lblk_t start,
				519	ext4_lblk_t end)
				520	{
				521	struct __track_range_args args;
				522	int ret;
				523
				524	if (S_ISDIR(inode->i_mode))
				525	return;
				526
				527	args.start = start;
				528	args.end = end;
				529
				530	ret = ext4_fc_track_template(inode, __track_range, &args, 1);
				531
				532	trace_ext4_fc_track_range(inode, start, end, ret);
				533	}
				534
				535	static void ext4_fc_submit_bh(struct super_block *sb)
				536	{
				537	int write_flags = REQ_SYNC;
				538	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
				539
				540	if (test_opt(sb, BARRIER))
				541	write_flags \|= REQ_FUA \| REQ_PREFLUSH;
				542	lock_buffer(bh);
				543	clear_buffer_dirty(bh);
				544	set_buffer_uptodate(bh);
				545	bh->b_end_io = ext4_end_buffer_io_sync;
				546	submit_bh(REQ_OP_WRITE, write_flags, bh);
				547	EXT4_SB(sb)->s_fc_bh = NULL;
				548	}
				549
				550	/* Ext4 commit path routines */
				551
				552	/* memzero and update CRC */
				553	static void ext4_fc_memzero(struct super_block sb, void *dst, int len,
				554	u32 *crc)
				555	{
				556	void *ret;
				557
				558	ret = memset(dst, 0, len);
				559	if (crc)
				560	crc = ext4_chksum(EXT4_SB(sb), crc, dst, len);
				561	return ret;
				562	}
				563
				564	/*
				565	* Allocate len bytes on a fast commit buffer.
				566	*
				567	* During the commit time this function is used to manage fast commit
				568	* block space. We don't split a fast commit log onto different
				569	* blocks. So this function makes sure that if there's not enough space
				570	* on the current block, the remaining space in the current block is
				571	* marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
				572	* new block is from jbd2 and CRC is updated to reflect the padding
				573	* we added.
				574	*/
				575	static u8 ext4_fc_reserve_space(struct super_block sb, int len, u32 *crc)
				576	{
				577	struct ext4_fc_tl *tl;
				578	struct ext4_sb_info *sbi = EXT4_SB(sb);
				579	struct buffer_head *bh;
				580	int bsize = sbi->s_journal->j_blocksize;
				581	int ret, off = sbi->s_fc_bytes % bsize;
				582	int pad_len;
				583
				584	/*
				585	* After allocating len, we should have space at least for a 0 byte
				586	* padding.
				587	*/
				588	if (len + sizeof(struct ext4_fc_tl) > bsize)
				589	return NULL;
				590
				591	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
				592	/*
				593	* Only allocate from current buffer if we have enough space for
				594	* this request AND we have space to add a zero byte padding.
				595	*/
				596	if (!sbi->s_fc_bh) {
				597	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
				598	if (ret)
				599	return NULL;
				600	sbi->s_fc_bh = bh;
				601	}
				602	sbi->s_fc_bytes += len;
				603	return sbi->s_fc_bh->b_data + off;
				604	}
				605	/* Need to add PAD tag */
				606	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
				607	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
				608	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
				609	tl->fc_len = cpu_to_le16(pad_len);
				610	if (crc)
				611	crc = ext4_chksum(sbi, crc, tl, sizeof(*tl));
				612	if (pad_len > 0)
				613	ext4_fc_memzero(sb, tl + 1, pad_len, crc);
				614	ext4_fc_submit_bh(sb);
				615
				616	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
				617	if (ret)
				618	return NULL;
				619	sbi->s_fc_bh = bh;
				620	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
				621	return sbi->s_fc_bh->b_data;
				622	}
				623
				624	/* memcpy to fc reserved space and update CRC */
				625	static void ext4_fc_memcpy(struct super_block sb, void dst, const void src,
				626	int len, u32 *crc)
				627	{
				628	if (crc)
				629	crc = ext4_chksum(EXT4_SB(sb), crc, src, len);
				630	return memcpy(dst, src, len);
				631	}
				632
				633	/*
				634	* Complete a fast commit by writing tail tag.
				635	*
				636	* Writing tail tag marks the end of a fast commit. In order to guarantee
				637	* atomicity, after writing tail tag, even if there's space remaining
				638	* in the block, next commit shouldn't use it. That's why tail tag
				639	* has the length as that of the remaining space on the block.
				640	*/
				641	static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
				642	{
				643	struct ext4_sb_info *sbi = EXT4_SB(sb);
				644	struct ext4_fc_tl tl;
				645	struct ext4_fc_tail tail;
				646	int off, bsize = sbi->s_journal->j_blocksize;
				647	u8 *dst;
				648
				649	/*
				650	* ext4_fc_reserve_space takes care of allocating an extra block if
				651	* there's no enough space on this block for accommodating this tail.
				652	*/
				653	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
				654	if (!dst)
				655	return -ENOSPC;
				656
				657	off = sbi->s_fc_bytes % bsize;
				658
				659	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
				660	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
				661	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
				662
				663	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
				664	dst += sizeof(tl);
				665	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
				666	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
				667	dst += sizeof(tail.fc_tid);
				668	tail.fc_crc = cpu_to_le32(crc);
				669	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
				670
				671	ext4_fc_submit_bh(sb);
				672
				673	return 0;
				674	}
				675
				676	/*
				677	* Adds tag, length, value and updates CRC. Returns true if tlv was added.
				678	* Returns false if there's not enough space.
				679	*/
				680	static bool ext4_fc_add_tlv(struct super_block sb, u16 tag, u16 len, u8 val,
				681	u32 *crc)
				682	{
				683	struct ext4_fc_tl tl;
				684	u8 *dst;
				685
				686	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
				687	if (!dst)
				688	return false;
				689
				690	tl.fc_tag = cpu_to_le16(tag);
				691	tl.fc_len = cpu_to_le16(len);
				692
				693	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
				694	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
				695
				696	return true;
				697	}
				698
				699	/* Same as above, but adds dentry tlv. */
				700	static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
				701	int parent_ino, int ino, int dlen,
				702	const unsigned char *dname,
				703	u32 *crc)
				704	{
				705	struct ext4_fc_dentry_info fcd;
				706	struct ext4_fc_tl tl;
				707	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
				708	crc);
				709
				710	if (!dst)
				711	return false;
				712
				713	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
				714	fcd.fc_ino = cpu_to_le32(ino);
				715	tl.fc_tag = cpu_to_le16(tag);
				716	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
				717	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
				718	dst += sizeof(tl);
				719	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
				720	dst += sizeof(fcd);
				721	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
				722	dst += dlen;
				723
				724	return true;
				725	}
				726
				727	/*
				728	* Writes inode in the fast commit space under TLV with tag @tag.
				729	* Returns 0 on success, error on failure.
				730	*/
				731	static int ext4_fc_write_inode(struct inode inode, u32 crc)
				732	{
				733	struct ext4_inode_info *ei = EXT4_I(inode);
				734	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
				735	int ret;
				736	struct ext4_iloc iloc;
				737	struct ext4_fc_inode fc_inode;
				738	struct ext4_fc_tl tl;
				739	u8 *dst;
				740
				741	ret = ext4_get_inode_loc(inode, &iloc);
				742	if (ret)
				743	return ret;
				744
				745	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
				746	inode_len += ei->i_extra_isize;
				747
				748	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
				749	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
				750	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
				751
				752	dst = ext4_fc_reserve_space(inode->i_sb,
				753	sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
				754	if (!dst)
				755	return -ECANCELED;
				756
				757	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
				758	return -ECANCELED;
				759	dst += sizeof(tl);
				760	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
				761	return -ECANCELED;
				762	dst += sizeof(fc_inode);
				763	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
				764	inode_len, crc))
				765	return -ECANCELED;
				766
				767	return 0;
				768	}
				769
				770	/*
				771	* Writes updated data ranges for the inode in question. Updates CRC.
				772	* Returns 0 on success, error otherwise.
				773	*/
				774	static int ext4_fc_write_inode_data(struct inode inode, u32 crc)
				775	{
				776	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
				777	struct ext4_inode_info *ei = EXT4_I(inode);
				778	struct ext4_map_blocks map;
				779	struct ext4_fc_add_range fc_ext;
				780	struct ext4_fc_del_range lrange;
				781	struct ext4_extent *ex;
				782	int ret;
				783
				784	mutex_lock(&ei->i_fc_lock);
				785	if (ei->i_fc_lblk_len == 0) {
				786	mutex_unlock(&ei->i_fc_lock);
				787	return 0;
				788	}
				789	old_blk_size = ei->i_fc_lblk_start;
				790	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
				791	ei->i_fc_lblk_len = 0;
				792	mutex_unlock(&ei->i_fc_lock);
				793
				794	cur_lblk_off = old_blk_size;
				795	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
				796	__func__, cur_lblk_off, new_blk_size, inode->i_ino);
				797
				798	while (cur_lblk_off <= new_blk_size) {
				799	map.m_lblk = cur_lblk_off;
				800	map.m_len = new_blk_size - cur_lblk_off + 1;
				801	ret = ext4_map_blocks(NULL, inode, &map, 0);
				802	if (ret < 0)
				803	return -ECANCELED;
				804
				805	if (map.m_len == 0) {
				806	cur_lblk_off++;
				807	continue;
				808	}
				809
				810	if (ret == 0) {
				811	lrange.fc_ino = cpu_to_le32(inode->i_ino);
				812	lrange.fc_lblk = cpu_to_le32(map.m_lblk);
				813	lrange.fc_len = cpu_to_le32(map.m_len);
				814	if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
				815	sizeof(lrange), (u8 *)&lrange, crc))
				816	return -ENOSPC;
				817	} else {
				818	fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
				819	ex = (struct ext4_extent *)&fc_ext.fc_ex;
				820	ex->ee_block = cpu_to_le32(map.m_lblk);
				821	ex->ee_len = cpu_to_le16(map.m_len);
				822	ext4_ext_store_pblock(ex, map.m_pblk);
				823	if (map.m_flags & EXT4_MAP_UNWRITTEN)
				824	ext4_ext_mark_unwritten(ex);
				825	else
				826	ext4_ext_mark_initialized(ex);
				827	if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
				828	sizeof(fc_ext), (u8 *)&fc_ext, crc))
				829	return -ENOSPC;
				830	}
				831
				832	cur_lblk_off += map.m_len;
				833	}
				834
				835	return 0;
				836	}
				837
				838
				839	/* Submit data for all the fast commit inodes */
				840	static int ext4_fc_submit_inode_data_all(journal_t *journal)
				841	{
				842	struct super_block sb = (struct super_block )(journal->j_private);
				843	struct ext4_sb_info *sbi = EXT4_SB(sb);
				844	struct ext4_inode_info *ei;
				845	struct list_head *pos;
				846	int ret = 0;
				847
				848	spin_lock(&sbi->s_fc_lock);
				849	sbi->s_mount_state \|= EXT4_FC_COMMITTING;
				850	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
				851	ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
				852	ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
				853	while (atomic_read(&ei->i_fc_updates)) {
				854	DEFINE_WAIT(wait);
				855
				856	prepare_to_wait(&ei->i_fc_wait, &wait,
				857	TASK_UNINTERRUPTIBLE);
				858	if (atomic_read(&ei->i_fc_updates)) {
				859	spin_unlock(&sbi->s_fc_lock);
				860	schedule();
				861	spin_lock(&sbi->s_fc_lock);
				862	}
				863	finish_wait(&ei->i_fc_wait, &wait);
				864	}
				865	spin_unlock(&sbi->s_fc_lock);
				866	ret = jbd2_submit_inode_data(ei->jinode);
				867	if (ret)
				868	return ret;
				869	spin_lock(&sbi->s_fc_lock);
				870	}
				871	spin_unlock(&sbi->s_fc_lock);
				872
				873	return ret;
				874	}
				875
				876	/* Wait for completion of data for all the fast commit inodes */
				877	static int ext4_fc_wait_inode_data_all(journal_t *journal)
				878	{
				879	struct super_block sb = (struct super_block )(journal->j_private);
				880	struct ext4_sb_info *sbi = EXT4_SB(sb);
				881	struct ext4_inode_info pos, n;
				882	int ret = 0;
				883
				884	spin_lock(&sbi->s_fc_lock);
				885	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
				886	if (!ext4_test_inode_state(&pos->vfs_inode,
				887	EXT4_STATE_FC_COMMITTING))
				888	continue;
				889	spin_unlock(&sbi->s_fc_lock);
				890
				891	ret = jbd2_wait_inode_data(journal, pos->jinode);
				892	if (ret)
				893	return ret;
				894	spin_lock(&sbi->s_fc_lock);
				895	}
				896	spin_unlock(&sbi->s_fc_lock);
				897
				898	return 0;
				899	}
				900
				901	/* Commit all the directory entry updates */
				902	static int ext4_fc_commit_dentry_updates(journal_t journal, u32 crc)
				903	{
				904	struct super_block sb = (struct super_block )(journal->j_private);
				905	struct ext4_sb_info *sbi = EXT4_SB(sb);
				906	struct ext4_fc_dentry_update *fc_dentry;
				907	struct inode *inode;
				908	struct list_head pos, n, fcd_pos, fcd_n;
				909	struct ext4_inode_info *ei;
				910	int ret;
				911
				912	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
				913	return 0;
				914	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
				915	fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
				916	fcd_list);
				917	if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
				918	spin_unlock(&sbi->s_fc_lock);
				919	if (!ext4_fc_add_dentry_tlv(
				920	sb, fc_dentry->fcd_op,
				921	fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				922	fc_dentry->fcd_name.len,
				923	fc_dentry->fcd_name.name, crc)) {
				924	ret = -ENOSPC;
				925	goto lock_and_exit;
				926	}
				927	spin_lock(&sbi->s_fc_lock);
				928	continue;
				929	}
				930
				931	inode = NULL;
				932	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
				933	ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
				934	if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				935	inode = &ei->vfs_inode;
				936	break;
				937	}
				938	}
				939	/*
				940	* If we don't find inode in our list, then it was deleted,
				941	* in which case, we don't need to record it's create tag.
				942	*/
				943	if (!inode)
				944	continue;
				945	spin_unlock(&sbi->s_fc_lock);
				946
				947	/*
				948	* We first write the inode and then the create dirent. This
				949	* allows the recovery code to create an unnamed inode first
				950	* and then link it to a directory entry. This allows us
				951	* to use namei.c routines almost as is and simplifies
				952	* the recovery code.
				953	*/
				954	ret = ext4_fc_write_inode(inode, crc);
				955	if (ret)
				956	goto lock_and_exit;
				957
				958	ret = ext4_fc_write_inode_data(inode, crc);
				959	if (ret)
				960	goto lock_and_exit;
				961
				962	if (!ext4_fc_add_dentry_tlv(
				963	sb, fc_dentry->fcd_op,
				964	fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				965	fc_dentry->fcd_name.len,
				966	fc_dentry->fcd_name.name, crc)) {
				967	spin_lock(&sbi->s_fc_lock);
				968	ret = -ENOSPC;
				969	goto lock_and_exit;
				970	}
				971
				972	spin_lock(&sbi->s_fc_lock);
				973	}
				974	return 0;
				975	lock_and_exit:
				976	spin_lock(&sbi->s_fc_lock);
				977	return ret;
				978	}
				979
				980	static int ext4_fc_perform_commit(journal_t *journal)
				981	{
				982	struct super_block sb = (struct super_block )(journal->j_private);
				983	struct ext4_sb_info *sbi = EXT4_SB(sb);
				984	struct ext4_inode_info *iter;
				985	struct ext4_fc_head head;
				986	struct list_head *pos;
				987	struct inode *inode;
				988	struct blk_plug plug;
				989	int ret = 0;
				990	u32 crc = 0;
				991
				992	ret = ext4_fc_submit_inode_data_all(journal);
				993	if (ret)
				994	return ret;
				995
				996	ret = ext4_fc_wait_inode_data_all(journal);
				997	if (ret)
				998	return ret;
				999
				1000	blk_start_plug(&plug);
				1001	if (sbi->s_fc_bytes == 0) {
				1002	/*
				1003	* Add a head tag only if this is the first fast commit
				1004	* in this TID.
				1005	*/
				1006	head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
				1007	head.fc_tid = cpu_to_le32(
				1008	sbi->s_journal->j_running_transaction->t_tid);
				1009	if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
				1010	(u8 *)&head, &crc))
				1011	goto out;
				1012	}
				1013
				1014	spin_lock(&sbi->s_fc_lock);
				1015	ret = ext4_fc_commit_dentry_updates(journal, &crc);
				1016	if (ret) {
				1017	spin_unlock(&sbi->s_fc_lock);
				1018	goto out;
				1019	}
				1020
				1021	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
				1022	iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
				1023	inode = &iter->vfs_inode;
				1024	if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
				1025	continue;
				1026
				1027	spin_unlock(&sbi->s_fc_lock);
				1028	ret = ext4_fc_write_inode_data(inode, &crc);
				1029	if (ret)
				1030	goto out;
				1031	ret = ext4_fc_write_inode(inode, &crc);
				1032	if (ret)
				1033	goto out;
				1034	spin_lock(&sbi->s_fc_lock);
				1035	EXT4_I(inode)->i_fc_committed_subtid =
				1036	atomic_read(&sbi->s_fc_subtid);
				1037	}
				1038	spin_unlock(&sbi->s_fc_lock);
				1039
				1040	ret = ext4_fc_write_tail(sb, crc);
				1041
				1042	out:
				1043	blk_finish_plug(&plug);
				1044	return ret;
				1045	}
				1046
				1047	/*
				1048	* The main commit entry point. Performs a fast commit for transaction
				1049	* commit_tid if needed. If it's not possible to perform a fast commit
				1050	* due to various reasons, we fall back to full commit. Returns 0
				1051	* on success, error otherwise.
				1052	*/
				1053	int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
				1054	{
				1055	struct super_block sb = (struct super_block )(journal->j_private);
				1056	struct ext4_sb_info *sbi = EXT4_SB(sb);
				1057	int nblks = 0, ret, bsize = journal->j_blocksize;
				1058	int subtid = atomic_read(&sbi->s_fc_subtid);
				1059	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
				1060	ktime_t start_time, commit_time;
				1061
				1062	trace_ext4_fc_commit_start(sb);
				1063
				1064	start_time = ktime_get();
				1065
				1066	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) \|\|
				1067	(ext4_fc_is_ineligible(sb))) {
				1068	reason = EXT4_FC_REASON_INELIGIBLE;
				1069	goto out;
				1070	}
				1071
				1072	restart_fc:
				1073	ret = jbd2_fc_begin_commit(journal, commit_tid);
				1074	if (ret == -EALREADY) {
				1075	/* There was an ongoing commit, check if we need to restart */
				1076	if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
				1077	commit_tid > journal->j_commit_sequence)
				1078	goto restart_fc;
				1079	reason = EXT4_FC_REASON_ALREADY_COMMITTED;
				1080	goto out;
				1081	} else if (ret) {
				1082	sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
				1083	reason = EXT4_FC_REASON_FC_START_FAILED;
				1084	goto out;
				1085	}
				1086
				1087	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
				1088	ret = ext4_fc_perform_commit(journal);
				1089	if (ret < 0) {
				1090	sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
				1091	reason = EXT4_FC_REASON_FC_FAILED;
				1092	goto out;
				1093	}
				1094	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
				1095	ret = jbd2_fc_wait_bufs(journal, nblks);
				1096	if (ret < 0) {
				1097	sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
				1098	reason = EXT4_FC_REASON_FC_FAILED;
				1099	goto out;
				1100	}
				1101	atomic_inc(&sbi->s_fc_subtid);
				1102	jbd2_fc_end_commit(journal);
				1103	out:
				1104	/* Has any ineligible update happened since we started? */
				1105	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
				1106	sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
				1107	reason = EXT4_FC_REASON_INELIGIBLE;
				1108	}
				1109
				1110	spin_lock(&sbi->s_fc_lock);
				1111	if (reason != EXT4_FC_REASON_OK &&
				1112	reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
				1113	sbi->s_fc_stats.fc_ineligible_commits++;
				1114	} else {
				1115	sbi->s_fc_stats.fc_num_commits++;
				1116	sbi->s_fc_stats.fc_numblks += nblks;
				1117	}
				1118	spin_unlock(&sbi->s_fc_lock);
				1119	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
				1120	trace_ext4_fc_commit_stop(sb, nblks, reason);
				1121	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
				1122	/*
				1123	* weight the commit time higher than the average time so we don't
				1124	* react too strongly to vast changes in the commit time
				1125	*/
				1126	if (likely(sbi->s_fc_avg_commit_time))
				1127	sbi->s_fc_avg_commit_time = (commit_time +
				1128	sbi->s_fc_avg_commit_time * 3) / 4;
				1129	else
				1130	sbi->s_fc_avg_commit_time = commit_time;
				1131	jbd_debug(1,
				1132	"Fast commit ended with blks = %d, reason = %d, subtid - %d",
				1133	nblks, reason, subtid);
				1134	if (reason == EXT4_FC_REASON_FC_FAILED)
				1135	return jbd2_fc_end_commit_fallback(journal, commit_tid);
				1136	if (reason == EXT4_FC_REASON_FC_START_FAILED \|\|
				1137	reason == EXT4_FC_REASON_INELIGIBLE)
				1138	return jbd2_complete_transaction(journal, commit_tid);
				1139	return 0;
				1140	}
				1141
Harshad Shirwadkar	ff780b9	2020-10-15 13:37:56 -0700	[diff] [blame]	1142	/*
				1143	* Fast commit cleanup routine. This is called after every fast commit and
				1144	* full commit. full is true if we are called after a full commit.
				1145	*/
				1146	static void ext4_fc_cleanup(journal_t *journal, int full)
				1147	{
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	1148	struct super_block *sb = journal->j_private;
				1149	struct ext4_sb_info *sbi = EXT4_SB(sb);
				1150	struct ext4_inode_info *iter;
				1151	struct ext4_fc_dentry_update *fc_dentry;
				1152	struct list_head pos, n;
				1153
				1154	if (full && sbi->s_fc_bh)
				1155	sbi->s_fc_bh = NULL;
				1156
				1157	jbd2_fc_release_bufs(journal);
				1158
				1159	spin_lock(&sbi->s_fc_lock);
				1160	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
				1161	iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
				1162	list_del_init(&iter->i_fc_list);
				1163	ext4_clear_inode_state(&iter->vfs_inode,
				1164	EXT4_STATE_FC_COMMITTING);
				1165	ext4_fc_reset_inode(&iter->vfs_inode);
				1166	/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
				1167	smp_mb();
				1168	#if (BITS_PER_LONG < 64)
				1169	wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
				1170	#else
				1171	wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
				1172	#endif
				1173	}
				1174
				1175	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
				1176	fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
				1177	struct ext4_fc_dentry_update,
				1178	fcd_list);
				1179	list_del_init(&fc_dentry->fcd_list);
				1180	spin_unlock(&sbi->s_fc_lock);
				1181
				1182	if (fc_dentry->fcd_name.name &&
				1183	fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
				1184	kfree(fc_dentry->fcd_name.name);
				1185	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
				1186	spin_lock(&sbi->s_fc_lock);
				1187	}
				1188
				1189	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				1190	&sbi->s_fc_dentry_q[FC_Q_MAIN]);
				1191	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				1192	&sbi->s_fc_q[FC_Q_STAGING]);
				1193
				1194	sbi->s_mount_state &= ~EXT4_FC_COMMITTING;
				1195	sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE;
				1196
				1197	if (full)
				1198	sbi->s_fc_bytes = 0;
				1199	spin_unlock(&sbi->s_fc_lock);
				1200	trace_ext4_fc_stats(sb);
Harshad Shirwadkar	ff780b9	2020-10-15 13:37:56 -0700	[diff] [blame]	1201	}
Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	1202
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	1203	/* Ext4 Replay Path Routines */
				1204
				1205	/* Get length of a particular tlv */
				1206	static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
				1207	{
				1208	return le16_to_cpu(tl->fc_len);
				1209	}
				1210
				1211	/* Get a pointer to "value" of a tlv */
				1212	static inline u8 ext4_fc_tag_val(struct ext4_fc_tl tl)
				1213	{
				1214	return (u8 )tl + sizeof(tl);
				1215	}
				1216
				/*
				 * Helper struct for dentry replay routines: the decoded contents of a
				 * dentry TLV (filled in by tl_to_darg()).
				 */
				struct dentry_info_args {
					int parent_ino;		/* inode number of the parent directory */
					int dname_len;		/* length of dname in bytes */
					int ino;		/* inode number the entry refers to */
					int inode_len;
					char *dname;		/* entry name; points into the TLV value */
				};
				1222
				1223	static inline void tl_to_darg(struct dentry_info_args *darg,
				1224	struct ext4_fc_tl *tl)
				1225	{
				1226	struct ext4_fc_dentry_info *fcd;
				1227
				1228	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
				1229
				1230	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
				1231	darg->ino = le32_to_cpu(fcd->fc_ino);
				1232	darg->dname = fcd->fc_dname;
				1233	darg->dname_len = ext4_fc_tag_len(tl) -
				1234	sizeof(struct ext4_fc_dentry_info);
				1235	}
				1236
				1237	/* Unlink replay function */
				1238	static int ext4_fc_replay_unlink(struct super_block sb, struct ext4_fc_tl tl)
				1239	{
				1240	struct inode inode, old_parent;
				1241	struct qstr entry;
				1242	struct dentry_info_args darg;
				1243	int ret = 0;
				1244
				1245	tl_to_darg(&darg, tl);
				1246
				1247	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
				1248	darg.parent_ino, darg.dname_len);
				1249
				1250	entry.name = darg.dname;
				1251	entry.len = darg.dname_len;
				1252	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
				1253
				1254	if (IS_ERR_OR_NULL(inode)) {
				1255	jbd_debug(1, "Inode %d not found", darg.ino);
				1256	return 0;
				1257	}
				1258
				1259	old_parent = ext4_iget(sb, darg.parent_ino,
				1260	EXT4_IGET_NORMAL);
				1261	if (IS_ERR_OR_NULL(old_parent)) {
				1262	jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
				1263	iput(inode);
				1264	return 0;
				1265	}
				1266
				1267	ret = __ext4_unlink(old_parent, &entry, inode);
				1268	/* -ENOENT ok coz it might not exist anymore. */
				1269	if (ret == -ENOENT)
				1270	ret = 0;
				1271	iput(old_parent);
				1272	iput(inode);
				1273	return ret;
				1274	}
				1275
				1276	static int ext4_fc_replay_link_internal(struct super_block *sb,
				1277	struct dentry_info_args *darg,
				1278	struct inode *inode)
				1279	{
				1280	struct inode *dir = NULL;
				1281	struct dentry dentry_dir = NULL, dentry_inode = NULL;
				1282	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
				1283	int ret = 0;
				1284
				1285	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
				1286	if (IS_ERR(dir)) {
				1287	jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
				1288	dir = NULL;
				1289	goto out;
				1290	}
				1291
				1292	dentry_dir = d_obtain_alias(dir);
				1293	if (IS_ERR(dentry_dir)) {
				1294	jbd_debug(1, "Failed to obtain dentry");
				1295	dentry_dir = NULL;
				1296	goto out;
				1297	}
				1298
				1299	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
				1300	if (!dentry_inode) {
				1301	jbd_debug(1, "Inode dentry not created.");
				1302	ret = -ENOMEM;
				1303	goto out;
				1304	}
				1305
				1306	ret = __ext4_link(dir, inode, dentry_inode);
				1307	/*
				1308	* It's possible that link already existed since data blocks
				1309	* for the dir in question got persisted before we crashed OR
				1310	* we replayed this tag and crashed before the entire replay
				1311	* could complete.
				1312	*/
				1313	if (ret && ret != -EEXIST) {
				1314	jbd_debug(1, "Failed to link\n");
				1315	goto out;
				1316	}
				1317
				1318	ret = 0;
				1319	out:
				1320	if (dentry_dir) {
				1321	d_drop(dentry_dir);
				1322	dput(dentry_dir);
				1323	} else if (dir) {
				1324	iput(dir);
				1325	}
				1326	if (dentry_inode) {
				1327	d_drop(dentry_inode);
				1328	dput(dentry_inode);
				1329	}
				1330
				1331	return ret;
				1332	}
				1333
				1334	/* Link replay function */
				1335	static int ext4_fc_replay_link(struct super_block sb, struct ext4_fc_tl tl)
				1336	{
				1337	struct inode *inode;
				1338	struct dentry_info_args darg;
				1339	int ret = 0;
				1340
				1341	tl_to_darg(&darg, tl);
				1342	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
				1343	darg.parent_ino, darg.dname_len);
				1344
				1345	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
				1346	if (IS_ERR_OR_NULL(inode)) {
				1347	jbd_debug(1, "Inode not found.");
				1348	return 0;
				1349	}
				1350
				1351	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
				1352	iput(inode);
				1353	return ret;
				1354	}
				1355
				1356	/*
				1357	* Record all the modified inodes during replay. We use this later to setup
				1358	* block bitmaps correctly.
				1359	*/
				1360	static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
				1361	{
				1362	struct ext4_fc_replay_state *state;
				1363	int i;
				1364
				1365	state = &EXT4_SB(sb)->s_fc_replay_state;
				1366	for (i = 0; i < state->fc_modified_inodes_used; i++)
				1367	if (state->fc_modified_inodes[i] == ino)
				1368	return 0;
				1369	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
				1370	state->fc_modified_inodes_size +=
				1371	EXT4_FC_REPLAY_REALLOC_INCREMENT;
				1372	state->fc_modified_inodes = krealloc(
				1373	state->fc_modified_inodes, sizeof(int) *
				1374	state->fc_modified_inodes_size,
				1375	GFP_KERNEL);
				1376	if (!state->fc_modified_inodes)
				1377	return -ENOMEM;
				1378	}
				1379	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
				1380	return 0;
				1381	}
				1382
				1383	/*
				1384	* Inode replay function
				1385	*/
				1386	static int ext4_fc_replay_inode(struct super_block sb, struct ext4_fc_tl tl)
				1387	{
				1388	struct ext4_fc_inode *fc_inode;
				1389	struct ext4_inode *raw_inode;
				1390	struct ext4_inode *raw_fc_inode;
				1391	struct inode *inode = NULL;
				1392	struct ext4_iloc iloc;
				1393	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
				1394	struct ext4_extent_header *eh;
				1395
				1396	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
				1397
				1398	ino = le32_to_cpu(fc_inode->fc_ino);
				1399	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
				1400
				1401	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
				1402	if (!IS_ERR_OR_NULL(inode)) {
				1403	ext4_ext_clear_bb(inode);
				1404	iput(inode);
				1405	}
				1406
				1407	ext4_fc_record_modified_inode(sb, ino);
				1408
				1409	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
				1410	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
				1411	if (ret)
				1412	goto out;
				1413
				1414	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
				1415	raw_inode = ext4_raw_inode(&iloc);
				1416
				1417	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
				1418	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
				1419	inode_len - offsetof(struct ext4_inode, i_generation));
				1420	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
				1421	eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
				1422	if (eh->eh_magic != EXT4_EXT_MAGIC) {
				1423	memset(eh, 0, sizeof(*eh));
				1424	eh->eh_magic = EXT4_EXT_MAGIC;
				1425	eh->eh_max = cpu_to_le16(
				1426	(sizeof(raw_inode->i_block) -
				1427	sizeof(struct ext4_extent_header))
				1428	/ sizeof(struct ext4_extent));
				1429	}
				1430	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
				1431	memcpy(raw_inode->i_block, raw_fc_inode->i_block,
				1432	sizeof(raw_inode->i_block));
				1433	}
				1434
				1435	/* Immediately update the inode on disk. */
				1436	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
				1437	if (ret)
				1438	goto out;
				1439	ret = sync_dirty_buffer(iloc.bh);
				1440	if (ret)
				1441	goto out;
				1442	ret = ext4_mark_inode_used(sb, ino);
				1443	if (ret)
				1444	goto out;
				1445
				1446	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
				1447	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
				1448	if (IS_ERR_OR_NULL(inode)) {
				1449	jbd_debug(1, "Inode not found.");
				1450	return -EFSCORRUPTED;
				1451	}
				1452
				1453	/*
				1454	* Our allocator could have made different decisions than before
				1455	* crashing. This should be fixed but until then, we calculate
				1456	* the number of blocks the inode.
				1457	*/
				1458	ext4_ext_replay_set_iblocks(inode);
				1459
				1460	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
				1461	ext4_reset_inode_seed(inode);
				1462
				1463	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
				1464	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
				1465	sync_dirty_buffer(iloc.bh);
				1466	brelse(iloc.bh);
				1467	out:
				1468	iput(inode);
				1469	if (!ret)
				1470	blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
				1471
				1472	return 0;
				1473	}
				1474
				1475	/*
				1476	* Dentry create replay function.
				1477	*
				1478	* EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
				1479	* inode for which we are trying to create a dentry here, should already have
				1480	* been replayed before we start here.
				1481	*/
				1482	static int ext4_fc_replay_create(struct super_block sb, struct ext4_fc_tl tl)
				1483	{
				1484	int ret = 0;
				1485	struct inode *inode = NULL;
				1486	struct inode *dir = NULL;
				1487	struct dentry_info_args darg;
				1488
				1489	tl_to_darg(&darg, tl);
				1490
				1491	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
				1492	darg.parent_ino, darg.dname_len);
				1493
				1494	/* This takes care of update group descriptor and other metadata */
				1495	ret = ext4_mark_inode_used(sb, darg.ino);
				1496	if (ret)
				1497	goto out;
				1498
				1499	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
				1500	if (IS_ERR_OR_NULL(inode)) {
				1501	jbd_debug(1, "inode %d not found.", darg.ino);
				1502	inode = NULL;
				1503	ret = -EINVAL;
				1504	goto out;
				1505	}
				1506
				1507	if (S_ISDIR(inode->i_mode)) {
				1508	/*
				1509	* If we are creating a directory, we need to make sure that the
				1510	* dot and dot dot dirents are setup properly.
				1511	*/
				1512	dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
				1513	if (IS_ERR_OR_NULL(dir)) {
				1514	jbd_debug(1, "Dir %d not found.", darg.ino);
				1515	goto out;
				1516	}
				1517	ret = ext4_init_new_dir(NULL, dir, inode);
				1518	iput(dir);
				1519	if (ret) {
				1520	ret = 0;
				1521	goto out;
				1522	}
				1523	}
				1524	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
				1525	if (ret)
				1526	goto out;
				1527	set_nlink(inode, 1);
				1528	ext4_mark_inode_dirty(NULL, inode);
				1529	out:
				1530	if (inode)
				1531	iput(inode);
				1532	return ret;
				1533	}
				1534
				1535	/*
				1536	* Record physical disk regions which are in use as per fast commit area. Our
				1537	* simple replay phase allocator excludes these regions from allocation.
				1538	*/
				1539	static int ext4_fc_record_regions(struct super_block *sb, int ino,
				1540	ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
				1541	{
				1542	struct ext4_fc_replay_state *state;
				1543	struct ext4_fc_alloc_region *region;
				1544
				1545	state = &EXT4_SB(sb)->s_fc_replay_state;
				1546	if (state->fc_regions_used == state->fc_regions_size) {
				1547	state->fc_regions_size +=
				1548	EXT4_FC_REPLAY_REALLOC_INCREMENT;
				1549	state->fc_regions = krealloc(
				1550	state->fc_regions,
				1551	state->fc_regions_size *
				1552	sizeof(struct ext4_fc_alloc_region),
				1553	GFP_KERNEL);
				1554	if (!state->fc_regions)
				1555	return -ENOMEM;
				1556	}
				1557	region = &state->fc_regions[state->fc_regions_used++];
				1558	region->ino = ino;
				1559	region->lblk = lblk;
				1560	region->pblk = pblk;
				1561	region->len = len;
				1562
				1563	return 0;
				1564	}
				1565
				1566	/* Replay add range tag */
				1567	static int ext4_fc_replay_add_range(struct super_block *sb,
				1568	struct ext4_fc_tl *tl)
				1569	{
				1570	struct ext4_fc_add_range *fc_add_ex;
				1571	struct ext4_extent newex, *ex;
				1572	struct inode *inode;
				1573	ext4_lblk_t start, cur;
				1574	int remaining, len;
				1575	ext4_fsblk_t start_pblk;
				1576	struct ext4_map_blocks map;
				1577	struct ext4_ext_path *path = NULL;
				1578	int ret;
				1579
				1580	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
				1581	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
				1582
				1583	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
				1584	le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
				1585	ext4_ext_get_actual_len(ex));
				1586
				1587	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				1588	EXT4_IGET_NORMAL);
				1589	if (IS_ERR_OR_NULL(inode)) {
				1590	jbd_debug(1, "Inode not found.");
				1591	return 0;
				1592	}
				1593
				1594	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
				1595
				1596	start = le32_to_cpu(ex->ee_block);
				1597	start_pblk = ext4_ext_pblock(ex);
				1598	len = ext4_ext_get_actual_len(ex);
				1599
				1600	cur = start;
				1601	remaining = len;
				1602	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
				1603	start, start_pblk, len, ext4_ext_is_unwritten(ex),
				1604	inode->i_ino);
				1605
				1606	while (remaining > 0) {
				1607	map.m_lblk = cur;
				1608	map.m_len = remaining;
				1609	map.m_pblk = 0;
				1610	ret = ext4_map_blocks(NULL, inode, &map, 0);
				1611
				1612	if (ret < 0) {
				1613	iput(inode);
				1614	return 0;
				1615	}
				1616
				1617	if (ret == 0) {
				1618	/* Range is not mapped */
				1619	path = ext4_find_extent(inode, cur, NULL, 0);
				1620	if (!path)
				1621	continue;
				1622	memset(&newex, 0, sizeof(newex));
				1623	newex.ee_block = cpu_to_le32(cur);
				1624	ext4_ext_store_pblock(
				1625	&newex, start_pblk + cur - start);
				1626	newex.ee_len = cpu_to_le16(map.m_len);
				1627	if (ext4_ext_is_unwritten(ex))
				1628	ext4_ext_mark_unwritten(&newex);
				1629	down_write(&EXT4_I(inode)->i_data_sem);
				1630	ret = ext4_ext_insert_extent(
				1631	NULL, inode, &path, &newex, 0);
				1632	up_write((&EXT4_I(inode)->i_data_sem));
				1633	ext4_ext_drop_refs(path);
				1634	kfree(path);
				1635	if (ret) {
				1636	iput(inode);
				1637	return 0;
				1638	}
				1639	goto next;
				1640	}
				1641
				1642	if (start_pblk + cur - start != map.m_pblk) {
				1643	/*
				1644	* Logical to physical mapping changed. This can happen
				1645	* if this range was removed and then reallocated to
				1646	* map to new physical blocks during a fast commit.
				1647	*/
				1648	ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				1649	ext4_ext_is_unwritten(ex),
				1650	start_pblk + cur - start);
				1651	if (ret) {
				1652	iput(inode);
				1653	return 0;
				1654	}
				1655	/*
				1656	* Mark the old blocks as free since they aren't used
				1657	* anymore. We maintain an array of all the modified
				1658	* inodes. In case these blocks are still used at either
				1659	* a different logical range in the same inode or in
				1660	* some different inode, we will mark them as allocated
				1661	* at the end of the FC replay using our array of
				1662	* modified inodes.
				1663	*/
				1664	ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
				1665	goto next;
				1666	}
				1667
				1668	/* Range is mapped and needs a state change */
				1669	jbd_debug(1, "Converting from %d to %d %lld",
				1670	map.m_flags & EXT4_MAP_UNWRITTEN,
				1671	ext4_ext_is_unwritten(ex), map.m_pblk);
				1672	ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				1673	ext4_ext_is_unwritten(ex), map.m_pblk);
				1674	if (ret) {
				1675	iput(inode);
				1676	return 0;
				1677	}
				1678	/*
				1679	* We may have split the extent tree while toggling the state.
				1680	* Try to shrink the extent tree now.
				1681	*/
				1682	ext4_ext_replay_shrink_inode(inode, start + len);
				1683	next:
				1684	cur += map.m_len;
				1685	remaining -= map.m_len;
				1686	}
				1687	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				1688	sb->s_blocksize_bits);
				1689	iput(inode);
				1690	return 0;
				1691	}
				1692
				1693	/* Replay DEL_RANGE tag */
				1694	static int
				1695	ext4_fc_replay_del_range(struct super_block sb, struct ext4_fc_tl tl)
				1696	{
				1697	struct inode *inode;
				1698	struct ext4_fc_del_range *lrange;
				1699	struct ext4_map_blocks map;
				1700	ext4_lblk_t cur, remaining;
				1701	int ret;
				1702
				1703	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
				1704	cur = le32_to_cpu(lrange->fc_lblk);
				1705	remaining = le32_to_cpu(lrange->fc_len);
				1706
				1707	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
				1708	le32_to_cpu(lrange->fc_ino), cur, remaining);
				1709
				1710	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
				1711	if (IS_ERR_OR_NULL(inode)) {
				1712	jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
				1713	return 0;
				1714	}
				1715
				1716	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
				1717
				1718	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
				1719	inode->i_ino, le32_to_cpu(lrange->fc_lblk),
				1720	le32_to_cpu(lrange->fc_len));
				1721	while (remaining > 0) {
				1722	map.m_lblk = cur;
				1723	map.m_len = remaining;
				1724
				1725	ret = ext4_map_blocks(NULL, inode, &map, 0);
				1726	if (ret < 0) {
				1727	iput(inode);
				1728	return 0;
				1729	}
				1730	if (ret > 0) {
				1731	remaining -= ret;
				1732	cur += ret;
				1733	ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
				1734	} else {
				1735	remaining -= map.m_len;
				1736	cur += map.m_len;
				1737	}
				1738	}
				1739
				1740	ret = ext4_punch_hole(inode,
				1741	le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
				1742	le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
				1743	if (ret)
				1744	jbd_debug(1, "ext4_punch_hole returned %d", ret);
				1745	ext4_ext_replay_shrink_inode(inode,
				1746	i_size_read(inode) >> sb->s_blocksize_bits);
				1747	ext4_mark_inode_dirty(NULL, inode);
				1748	iput(inode);
				1749
				1750	return 0;
				1751	}
				1752
				1753	static inline const char *tag2str(u16 tag)
				1754	{
				1755	switch (tag) {
				1756	case EXT4_FC_TAG_LINK:
				1757	return "TAG_ADD_ENTRY";
				1758	case EXT4_FC_TAG_UNLINK:
				1759	return "TAG_DEL_ENTRY";
				1760	case EXT4_FC_TAG_ADD_RANGE:
				1761	return "TAG_ADD_RANGE";
				1762	case EXT4_FC_TAG_CREAT:
				1763	return "TAG_CREAT_DENTRY";
				1764	case EXT4_FC_TAG_DEL_RANGE:
				1765	return "TAG_DEL_RANGE";
				1766	case EXT4_FC_TAG_INODE:
				1767	return "TAG_INODE";
				1768	case EXT4_FC_TAG_PAD:
				1769	return "TAG_PAD";
				1770	case EXT4_FC_TAG_TAIL:
				1771	return "TAG_TAIL";
				1772	case EXT4_FC_TAG_HEAD:
				1773	return "TAG_HEAD";
				1774	default:
				1775	return "TAG_ERROR";
				1776	}
				1777	}
				1778
				1779	static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
				1780	{
				1781	struct ext4_fc_replay_state *state;
				1782	struct inode *inode;
				1783	struct ext4_ext_path *path = NULL;
				1784	struct ext4_map_blocks map;
				1785	int i, ret, j;
				1786	ext4_lblk_t cur, end;
				1787
				1788	state = &EXT4_SB(sb)->s_fc_replay_state;
				1789	for (i = 0; i < state->fc_modified_inodes_used; i++) {
				1790	inode = ext4_iget(sb, state->fc_modified_inodes[i],
				1791	EXT4_IGET_NORMAL);
				1792	if (IS_ERR_OR_NULL(inode)) {
				1793	jbd_debug(1, "Inode %d not found.",
				1794	state->fc_modified_inodes[i]);
				1795	continue;
				1796	}
				1797	cur = 0;
				1798	end = EXT_MAX_BLOCKS;
				1799	while (cur < end) {
				1800	map.m_lblk = cur;
				1801	map.m_len = end - cur;
				1802
				1803	ret = ext4_map_blocks(NULL, inode, &map, 0);
				1804	if (ret < 0)
				1805	break;
				1806
				1807	if (ret > 0) {
				1808	path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				1809	if (!IS_ERR_OR_NULL(path)) {
				1810	for (j = 0; j < path->p_depth; j++)
				1811	ext4_mb_mark_bb(inode->i_sb,
				1812	path[j].p_block, 1, 1);
				1813	ext4_ext_drop_refs(path);
				1814	kfree(path);
				1815	}
				1816	cur += ret;
				1817	ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
				1818	map.m_len, 1);
				1819	} else {
				1820	cur = cur + (map.m_len ? map.m_len : 1);
				1821	}
				1822	}
				1823	iput(inode);
				1824	}
				1825	}
				1826
				1827	/*
				1828	* Check if block is in excluded regions for block allocation. The simple
				1829	* allocator that runs during replay phase is calls this function to see
				1830	* if it is okay to use a block.
				1831	*/
				1832	bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
				1833	{
				1834	int i;
				1835	struct ext4_fc_replay_state *state;
				1836
				1837	state = &EXT4_SB(sb)->s_fc_replay_state;
				1838	for (i = 0; i < state->fc_regions_valid; i++) {
				1839	if (state->fc_regions[i].ino == 0 \|\|
				1840	state->fc_regions[i].len == 0)
				1841	continue;
				1842	if (blk >= state->fc_regions[i].pblk &&
				1843	blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
				1844	return true;
				1845	}
				1846	return false;
				1847	}
				1848
				1849	/* Cleanup function called after replay */
				1850	void ext4_fc_replay_cleanup(struct super_block *sb)
				1851	{
				1852	struct ext4_sb_info *sbi = EXT4_SB(sb);
				1853
				1854	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
				1855	kfree(sbi->s_fc_replay_state.fc_regions);
				1856	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
				1857	}
				1858
				1859	/*
				1860	* Recovery Scan phase handler
				1861	*
				1862	* This function is called during the scan phase and is responsible
				1863	* for doing following things:
				1864	* - Make sure the fast commit area has valid tags for replay
				1865	* - Count number of tags that need to be replayed by the replay handler
				1866	* - Verify CRC
				1867	* - Create a list of excluded blocks for allocation during replay phase
				1868	*
				1869	* This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
				1870	* incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
				1871	* to indicate that scan has finished and JBD2 can now start replay phase.
				1872	* It returns a negative error to indicate that there was an error. At the end
				1873	* of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
				1874	* to indicate the number of tags that need to replayed during the replay phase.
				1875	*/
				1876	static int ext4_fc_replay_scan(journal_t *journal,
				1877	struct buffer_head *bh, int off,
				1878	tid_t expected_tid)
				1879	{
				1880	struct super_block *sb = journal->j_private;
				1881	struct ext4_sb_info *sbi = EXT4_SB(sb);
				1882	struct ext4_fc_replay_state *state;
				1883	int ret = JBD2_FC_REPLAY_CONTINUE;
				1884	struct ext4_fc_add_range *ext;
				1885	struct ext4_fc_tl *tl;
				1886	struct ext4_fc_tail *tail;
				1887	__u8 start, end;
				1888	struct ext4_fc_head *head;
				1889	struct ext4_extent *ex;
				1890
				1891	state = &sbi->s_fc_replay_state;
				1892
				1893	start = (u8 *)bh->b_data;
				1894	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
				1895
				1896	if (state->fc_replay_expected_off == 0) {
				1897	state->fc_cur_tag = 0;
				1898	state->fc_replay_num_tags = 0;
				1899	state->fc_crc = 0;
				1900	state->fc_regions = NULL;
				1901	state->fc_regions_valid = state->fc_regions_used =
				1902	state->fc_regions_size = 0;
				1903	/* Check if we can stop early */
				1904	if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
				1905	!= EXT4_FC_TAG_HEAD)
				1906	return 0;
				1907	}
				1908
				1909	if (off != state->fc_replay_expected_off) {
				1910	ret = -EFSCORRUPTED;
				1911	goto out_err;
				1912	}
				1913
				1914	state->fc_replay_expected_off++;
				1915	fc_for_each_tl(start, end, tl) {
				1916	jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
				1917	tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
				1918	switch (le16_to_cpu(tl->fc_tag)) {
				1919	case EXT4_FC_TAG_ADD_RANGE:
				1920	ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
				1921	ex = (struct ext4_extent *)&ext->fc_ex;
				1922	ret = ext4_fc_record_regions(sb,
				1923	le32_to_cpu(ext->fc_ino),
				1924	le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				1925	ext4_ext_get_actual_len(ex));
				1926	if (ret < 0)
				1927	break;
				1928	ret = JBD2_FC_REPLAY_CONTINUE;
				1929	fallthrough;
				1930	case EXT4_FC_TAG_DEL_RANGE:
				1931	case EXT4_FC_TAG_LINK:
				1932	case EXT4_FC_TAG_UNLINK:
				1933	case EXT4_FC_TAG_CREAT:
				1934	case EXT4_FC_TAG_INODE:
				1935	case EXT4_FC_TAG_PAD:
				1936	state->fc_cur_tag++;
				1937	state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
				1938	sizeof(*tl) + ext4_fc_tag_len(tl));
				1939	break;
				1940	case EXT4_FC_TAG_TAIL:
				1941	state->fc_cur_tag++;
				1942	tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
				1943	state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
				1944	sizeof(*tl) +
				1945	offsetof(struct ext4_fc_tail,
				1946	fc_crc));
				1947	if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				1948	le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				1949	state->fc_replay_num_tags = state->fc_cur_tag;
				1950	state->fc_regions_valid =
				1951	state->fc_regions_used;
				1952	} else {
				1953	ret = state->fc_replay_num_tags ?
				1954	JBD2_FC_REPLAY_STOP : -EFSBADCRC;
				1955	}
				1956	state->fc_crc = 0;
				1957	break;
				1958	case EXT4_FC_TAG_HEAD:
				1959	head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
				1960	if (le32_to_cpu(head->fc_features) &
				1961	~EXT4_FC_SUPPORTED_FEATURES) {
				1962	ret = -EOPNOTSUPP;
				1963	break;
				1964	}
				1965	if (le32_to_cpu(head->fc_tid) != expected_tid) {
				1966	ret = JBD2_FC_REPLAY_STOP;
				1967	break;
				1968	}
				1969	state->fc_cur_tag++;
				1970	state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
				1971	sizeof(*tl) + ext4_fc_tag_len(tl));
				1972	break;
				1973	default:
				1974	ret = state->fc_replay_num_tags ?
				1975	JBD2_FC_REPLAY_STOP : -ECANCELED;
				1976	}
				1977	if (ret < 0 \|\| ret == JBD2_FC_REPLAY_STOP)
				1978	break;
				1979	}
				1980
				1981	out_err:
				1982	trace_ext4_fc_replay_scan(sb, ret, off);
				1983	return ret;
				1984	}
				1985
Harshad Shirwadkar	5b849b5	2020-10-15 13:37:58 -0700	[diff] [blame]	1986	/*
				1987	* Main recovery path entry point.
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	1988	* The meaning of return codes is similar as above.
Harshad Shirwadkar	5b849b5	2020-10-15 13:37:58 -0700	[diff] [blame]	1989	*/
				1990	static int ext4_fc_replay(journal_t journal, struct buffer_head bh,
				1991	enum passtype pass, int off, tid_t expected_tid)
				1992	{
Harshad Shirwadkar	8016e29	2020-10-15 13:37:59 -0700	[diff] [blame]	1993	struct super_block *sb = journal->j_private;
				1994	struct ext4_sb_info *sbi = EXT4_SB(sb);
				1995	struct ext4_fc_tl *tl;
				1996	__u8 start, end;
				1997	int ret = JBD2_FC_REPLAY_CONTINUE;
				1998	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
				1999	struct ext4_fc_tail *tail;
				2000
				2001	if (pass == PASS_SCAN) {
				2002	state->fc_current_pass = PASS_SCAN;
				2003	return ext4_fc_replay_scan(journal, bh, off, expected_tid);
				2004	}
				2005
				2006	if (state->fc_current_pass != pass) {
				2007	state->fc_current_pass = pass;
				2008	sbi->s_mount_state \|= EXT4_FC_REPLAY;
				2009	}
				2010	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
				2011	jbd_debug(1, "Replay stops\n");
				2012	ext4_fc_set_bitmaps_and_counters(sb);
				2013	return 0;
				2014	}
				2015
				2016	#ifdef CONFIG_EXT4_DEBUG
				2017	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
				2018	pr_warn("Dropping fc block %d because max_replay set\n", off);
				2019	return JBD2_FC_REPLAY_STOP;
				2020	}
				2021	#endif
				2022
				2023	start = (u8 *)bh->b_data;
				2024	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
				2025
				2026	fc_for_each_tl(start, end, tl) {
				2027	if (state->fc_replay_num_tags == 0) {
				2028	ret = JBD2_FC_REPLAY_STOP;
				2029	ext4_fc_set_bitmaps_and_counters(sb);
				2030	break;
				2031	}
				2032	jbd_debug(3, "Replay phase, tag:%s\n",
				2033	tag2str(le16_to_cpu(tl->fc_tag)));
				2034	state->fc_replay_num_tags--;
				2035	switch (le16_to_cpu(tl->fc_tag)) {
				2036	case EXT4_FC_TAG_LINK:
				2037	ret = ext4_fc_replay_link(sb, tl);
				2038	break;
				2039	case EXT4_FC_TAG_UNLINK:
				2040	ret = ext4_fc_replay_unlink(sb, tl);
				2041	break;
				2042	case EXT4_FC_TAG_ADD_RANGE:
				2043	ret = ext4_fc_replay_add_range(sb, tl);
				2044	break;
				2045	case EXT4_FC_TAG_CREAT:
				2046	ret = ext4_fc_replay_create(sb, tl);
				2047	break;
				2048	case EXT4_FC_TAG_DEL_RANGE:
				2049	ret = ext4_fc_replay_del_range(sb, tl);
				2050	break;
				2051	case EXT4_FC_TAG_INODE:
				2052	ret = ext4_fc_replay_inode(sb, tl);
				2053	break;
				2054	case EXT4_FC_TAG_PAD:
				2055	trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				2056	ext4_fc_tag_len(tl), 0);
				2057	break;
				2058	case EXT4_FC_TAG_TAIL:
				2059	trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				2060	ext4_fc_tag_len(tl), 0);
				2061	tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
				2062	WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
				2063	break;
				2064	case EXT4_FC_TAG_HEAD:
				2065	break;
				2066	default:
				2067	trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				2068	ext4_fc_tag_len(tl), 0);
				2069	ret = -ECANCELED;
				2070	break;
				2071	}
				2072	if (ret < 0)
				2073	break;
				2074	ret = JBD2_FC_REPLAY_CONTINUE;
				2075	}
				2076	return ret;
Harshad Shirwadkar	5b849b5	2020-10-15 13:37:58 -0700	[diff] [blame]	2077	}
				2078
Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	2079	void ext4_fc_init(struct super_block sb, journal_t journal)
				2080	{
Harshad Shirwadkar	5b849b5	2020-10-15 13:37:58 -0700	[diff] [blame]	2081	/*
				2082	* We set replay callback even if fast commit disabled because we may
				2083	* could still have fast commit blocks that need to be replayed even if
				2084	* fast commit has now been turned off.
				2085	*/
				2086	journal->j_fc_replay_callback = ext4_fc_replay;
Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	2087	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
				2088	return;
Harshad Shirwadkar	ff780b9	2020-10-15 13:37:56 -0700	[diff] [blame]	2089	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
Harshad Shirwadkar	6866d7b	2020-10-15 13:37:55 -0700	[diff] [blame]	2090	if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) {
				2091	pr_warn("Error while enabling fast commits, turning off.");
				2092	ext4_clear_feature_fast_commit(sb);
				2093	}
				2094	}
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	2095
/*
 * Human readable descriptions of the reasons a fast commit can be ineligible.
 * ext4_fc_info_show() indexes this table with the same index it uses for
 * stats->fc_ineligible_reason_count[], so the order of the entries must
 * follow the order of the reason codes.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
				2107
				2108	int ext4_fc_info_show(struct seq_file seq, void v)
				2109	{
				2110	struct ext4_sb_info sbi = EXT4_SB((struct super_block )seq->private);
				2111	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
				2112	int i;
				2113
				2114	if (v != SEQ_START_TOKEN)
				2115	return 0;
				2116
				2117	seq_printf(seq,
				2118	"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
				2119	stats->fc_num_commits, stats->fc_ineligible_commits,
				2120	stats->fc_numblks,
				2121	div_u64(sbi->s_fc_avg_commit_time, 1000));
				2122	seq_puts(seq, "Ineligible reasons:\n");
				2123	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
				2124	seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
				2125	stats->fc_ineligible_reason_count[i]);
				2126
				2127	return 0;
				2128	}
				2129
Harshad Shirwadkar	aa75f4d	2020-10-15 13:37:57 -0700	[diff] [blame]	2130	int __init ext4_fc_init_dentry_cache(void)
				2131	{
				2132	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
				2133	SLAB_RECLAIM_ACCOUNT);
				2134
				2135	if (ext4_fc_dentry_cachep == NULL)
				2136	return -ENOMEM;
				2137
				2138	return 0;
				2139	}