Blame - fs/ext4/inode.c - linux-4.4

blob: 320acb6c35bfc356243b30a3ff8f72e66d4b913d [file] [log] [blame]

Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2	* linux/fs/ext4/inode.c
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3	*
				4	* Copyright (C) 1992, 1993, 1994, 1995
				5	* Remy Card (card@masi.ibp.fr)
				6	* Laboratoire MASI - Institut Blaise Pascal
				7	* Universite Pierre et Marie Curie (Paris VI)
				8	*
				9	* from
				10	*
				11	* linux/fs/minix/inode.c
				12	*
				13	* Copyright (C) 1991, 1992 Linus Torvalds
				14	*
				15	* Goal-directed block allocation by Stephen Tweedie
				16	* (sct@redhat.com), 1993, 1998
				17	* Big-endian to little-endian byte-swapping/bitmaps by
				18	* David S. Miller (davem@caip.rutgers.edu), 1995
				19	* 64-bit file support on 64-bit platforms by Jakub Jelinek
				20	* (jj@sunsite.ms.mff.cuni.cz)
				21	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	22	* Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	23	*/
				24
				25	#include <linux/module.h>
				26	#include <linux/fs.h>
				27	#include <linux/time.h>
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	28	#include <linux/jbd2.h>
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	29	#include <linux/highuid.h>
				30	#include <linux/pagemap.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/string.h>
				33	#include <linux/buffer_head.h>
				34	#include <linux/writeback.h>
				35	#include <linux/mpage.h>
				36	#include <linux/uio.h>
				37	#include <linux/bio.h>
Christoph Hellwig	3dcf545	2008-04-29 18:13:32 -0400	[diff] [blame]	38	#include "ext4_jbd2.h"
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	39	#include "xattr.h"
				40	#include "acl.h"
				41
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	42	/*
				43	* Test whether an inode is a fast symlink.
				44	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	45	static int ext4_inode_is_fast_symlink(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	46	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	47	int ea_blocks = EXT4_I(inode)->i_file_acl ?
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	48	(inode->i_sb->s_blocksize >> 9) : 0;
				49
				50	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
				51	}
				52
				53	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	54	* The ext4 forget function must perform a revoke if we are freeing data
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	55	* which has been journaled. Metadata (eg. indirect blocks) must be
				56	* revoked in all cases.
				57	*
				58	* "bh" may be NULL: a metadata block may have been freed from memory
				59	* but there may still be a record of it in the journal, and that record
				60	* still needs to be revoked.
				61	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	62	int ext4_forget(handle_t handle, int is_metadata, struct inode inode,
				63	struct buffer_head *bh, ext4_fsblk_t blocknr)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	64	{
				65	int err;
				66
				67	might_sleep();
				68
				69	BUFFER_TRACE(bh, "enter");
				70
				71	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
				72	"data mode %lx\n",
				73	bh, is_metadata, inode->i_mode,
				74	test_opt(inode->i_sb, DATA_FLAGS));
				75
				76	/* Never use the revoke function if we are doing full data
				77	* journaling: there is no need to, and a V1 superblock won't
				78	* support it. Otherwise, only skip the revoke on un-journaled
				79	* data blocks. */
				80
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	81	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA \|\|
				82	(!is_metadata && !ext4_should_journal_data(inode))) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	83	if (bh) {
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	84	BUFFER_TRACE(bh, "call jbd2_journal_forget");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	85	return ext4_journal_forget(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	86	}
				87	return 0;
				88	}
				89
				90	/*
				91	* data!=journal && (is_metadata \|\| should_journal_data(inode))
				92	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	93	BUFFER_TRACE(bh, "call ext4_journal_revoke");
				94	err = ext4_journal_revoke(handle, blocknr, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	95	if (err)
Harvey Harrison	46e665e	2008-04-17 10:38:59 -0400	[diff] [blame]	96	ext4_abort(inode->i_sb, __func__,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	97	"error %d when attempting revoke", err);
				98	BUFFER_TRACE(bh, "exit");
				99	return err;
				100	}
				101
				102	/*
				103	* Work out how many blocks we need to proceed with the next chunk of a
				104	* truncate transaction.
				105	*/
				106	static unsigned long blocks_for_truncate(struct inode *inode)
				107	{
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	108	ext4_lblk_t needed;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	109
				110	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
				111
				112	/* Give ourselves just enough room to cope with inodes in which
				113	* i_blocks is corrupt: we've seen disk corruptions in the past
				114	* which resulted in random data in an inode which looked enough
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	115	* like a regular file for ext4 to try to delete it. Things
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	116	* will go a bit crazy if that happens, but at least we should
				117	* try not to panic the whole kernel. */
				118	if (needed < 2)
				119	needed = 2;
				120
				121	/* But we need to bound the transaction so we don't overflow the
				122	* journal. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	123	if (needed > EXT4_MAX_TRANS_DATA)
				124	needed = EXT4_MAX_TRANS_DATA;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	125
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	126	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	127	}
				128
				129	/*
				130	* Truncate transactions can be complex and absolutely huge. So we need to
				131	* be able to restart the transaction at a conventient checkpoint to make
				132	* sure we don't overflow the journal.
				133	*
				134	* start_transaction gets us a new handle for a truncate transaction,
				135	* and extend_transaction tries to extend the existing one a bit. If
				136	* extend fails, we need to propagate the failure up and restart the
				137	* transaction in the top-level truncate loop. --sct
				138	*/
				139	static handle_t start_transaction(struct inode inode)
				140	{
				141	handle_t *result;
				142
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	143	result = ext4_journal_start(inode, blocks_for_truncate(inode));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	144	if (!IS_ERR(result))
				145	return result;
				146
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	147	ext4_std_error(inode->i_sb, PTR_ERR(result));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	148	return result;
				149	}
				150
				151	/*
				152	* Try to extend this transaction for the purposes of truncation.
				153	*
				154	* Returns 0 if we managed to create more room. If we can't create more
				155	* room, and the transaction must be restarted we return 1.
				156	*/
				157	static int try_to_extend_transaction(handle_t handle, struct inode inode)
				158	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	159	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	160	return 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	161	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	162	return 0;
				163	return 1;
				164	}
				165
				166	/*
				167	* Restart the transaction associated with *handle. This does a commit,
				168	* so before we call here everything must be consistently dirtied against
				169	* this transaction.
				170	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	171	static int ext4_journal_test_restart(handle_t handle, struct inode inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	172	{
				173	jbd_debug(2, "restarting handle %p\n", handle);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	174	return ext4_journal_restart(handle, blocks_for_truncate(inode));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	175	}
				176
				177	/*
				178	* Called at the last iput() if i_nlink is zero.
				179	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	180	void ext4_delete_inode (struct inode * inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	181	{
				182	handle_t *handle;
				183
				184	truncate_inode_pages(&inode->i_data, 0);
				185
				186	if (is_bad_inode(inode))
				187	goto no_delete;
				188
				189	handle = start_transaction(inode);
				190	if (IS_ERR(handle)) {
				191	/*
				192	* If we're going to skip the normal cleanup, we still need to
				193	* make sure that the in-core orphan linked list is properly
				194	* cleaned up.
				195	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	196	ext4_orphan_del(NULL, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	197	goto no_delete;
				198	}
				199
				200	if (IS_SYNC(inode))
				201	handle->h_sync = 1;
				202	inode->i_size = 0;
				203	if (inode->i_blocks)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	204	ext4_truncate(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	205	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	206	* Kill off the orphan record which ext4_truncate created.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	207	* AKPM: I think this can be inside the above `if'.
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	208	* Note that ext4_orphan_del() has to be able to cope with the
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	209	* deletion of a non-existent orphan - this is because we don't
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	210	* know if ext4_truncate() actually created an orphan record.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	211	* (Well, we could do this if we need to, but heck - it works)
				212	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	213	ext4_orphan_del(handle, inode);
				214	EXT4_I(inode)->i_dtime = get_seconds();
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	215
				216	/*
				217	* One subtle ordering requirement: if anything has gone wrong
				218	* (transaction abort, IO errors, whatever), then we can still
				219	* do these next steps (the fs will already have been marked as
				220	* having errors), but we can't free the inode if the mark_dirty
				221	* fails.
				222	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	223	if (ext4_mark_inode_dirty(handle, inode))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	224	/* If that failed, just do the required in-core inode clear. */
				225	clear_inode(inode);
				226	else
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	227	ext4_free_inode(handle, inode);
				228	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	229	return;
				230	no_delete:
				231	clear_inode(inode); /* We must guarantee clearing of inode... */
				232	}
				233
				234	typedef struct {
				235	__le32 *p;
				236	__le32 key;
				237	struct buffer_head *bh;
				238	} Indirect;
				239
				240	static inline void add_chain(Indirect p, struct buffer_head bh, __le32 *v)
				241	{
				242	p->key = *(p->p = v);
				243	p->bh = bh;
				244	}
				245
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	246	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	247	* ext4_block_to_path - parse the block number into array of offsets
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	248	* @inode: inode in question (we are only interested in its superblock)
				249	* @i_block: block number to be parsed
				250	* @offsets: array to store the offsets in
Dave Kleikamp	8c55e20	2007-05-24 13:04:54 -0400	[diff] [blame]	251	* @boundary: set this non-zero if the referred-to block is likely to be
				252	* followed (on disk) by an indirect block.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	253	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	254	* To store the locations of file's data ext4 uses a data structure common
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	255	* for UNIX filesystems - tree of pointers anchored in the inode, with
				256	* data blocks at leaves and indirect blocks in intermediate nodes.
				257	* This function translates the block number into path in that tree -
				258	* return value is the path length and @offsets[n] is the offset of
				259	* pointer to (n+1)th node in the nth one. If @block is out of range
				260	* (negative or too large) warning is printed and zero returned.
				261	*
				262	* Note: function doesn't find node addresses, so no IO is needed. All
				263	* we need to know is the capacity of indirect blocks (taken from the
				264	* inode->i_sb).
				265	*/
				266
				267	/*
				268	* Portability note: the last comparison (check that we fit into triple
				269	* indirect block) is spelled differently, because otherwise on an
				270	* architecture with 32-bit longs and 8Kb pages we might get into trouble
				271	* if our filesystem had 8Kb blocks. We might use long long, but that would
				272	* kill us on x86. Oh, well, at least the sign propagation does not matter -
				273	* i_block would have to be negative in the very beginning, so we would not
				274	* get there at all.
				275	*/
				276
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	277	static int ext4_block_to_path(struct inode *inode,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	278	ext4_lblk_t i_block,
				279	ext4_lblk_t offsets[4], int *boundary)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	280	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	281	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
				282	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
				283	const long direct_blocks = EXT4_NDIR_BLOCKS,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	284	indirect_blocks = ptrs,
				285	double_blocks = (1 << (ptrs_bits * 2));
				286	int n = 0;
				287	int final = 0;
				288
				289	if (i_block < 0) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	290	ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	291	} else if (i_block < direct_blocks) {
				292	offsets[n++] = i_block;
				293	final = direct_blocks;
				294	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	295	offsets[n++] = EXT4_IND_BLOCK;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	296	offsets[n++] = i_block;
				297	final = ptrs;
				298	} else if ((i_block -= indirect_blocks) < double_blocks) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	299	offsets[n++] = EXT4_DIND_BLOCK;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	300	offsets[n++] = i_block >> ptrs_bits;
				301	offsets[n++] = i_block & (ptrs - 1);
				302	final = ptrs;
				303	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	304	offsets[n++] = EXT4_TIND_BLOCK;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	305	offsets[n++] = i_block >> (ptrs_bits * 2);
				306	offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
				307	offsets[n++] = i_block & (ptrs - 1);
				308	final = ptrs;
				309	} else {
Eric Sandeen	e2b4657	2008-01-28 23:58:27 -0500	[diff] [blame]	310	ext4_warning(inode->i_sb, "ext4_block_to_path",
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	311	"block %lu > max",
Eric Sandeen	e2b4657	2008-01-28 23:58:27 -0500	[diff] [blame]	312	i_block + direct_blocks +
				313	indirect_blocks + double_blocks);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	314	}
				315	if (boundary)
				316	*boundary = final - 1 - (i_block & (ptrs - 1));
				317	return n;
				318	}
				319
				320	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	321	* ext4_get_branch - read the chain of indirect blocks leading to data
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	322	* @inode: inode in question
				323	* @depth: depth of the chain (1 - direct pointer, etc.)
				324	* @offsets: offsets of pointers in inode/indirect blocks
				325	* @chain: place to store the result
				326	* @err: here we store the error value
				327	*
				328	* Function fills the array of triples <key, p, bh> and returns %NULL
				329	* if everything went OK or the pointer to the last filled triple
				330	* (incomplete one) otherwise. Upon the return chain[i].key contains
				331	* the number of (i+1)-th block in the chain (as it is stored in memory,
				332	* i.e. little-endian 32-bit), chain[i].p contains the address of that
				333	* number (it points into struct inode for i==0 and into the bh->b_data
				334	* for i>0) and chain[i].bh points to the buffer_head of i-th indirect
				335	* block for i>0 and NULL for i==0. In other words, it holds the block
				336	* numbers of the chain, addresses they were taken from (and where we can
				337	* verify that chain did not change) and buffer_heads hosting these
				338	* numbers.
				339	*
				340	* Function stops when it stumbles upon zero pointer (absent block)
				341	* (pointer to last triple returned, *@err == 0)
				342	* or when it gets an IO error reading an indirect block
				343	* (ditto, *@err == -EIO)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	344	* or when it reads all @depth-1 indirect blocks successfully and finds
				345	* the whole chain, all way to the data (returns %NULL, *err == 0).
Aneesh Kumar K.V	c278bfe	2008-01-28 23:58:27 -0500	[diff] [blame]	346	*
				347	* Need to be called with
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	348	* down_read(&EXT4_I(inode)->i_data_sem)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	349	*/
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	350	static Indirect ext4_get_branch(struct inode inode, int depth,
				351	ext4_lblk_t *offsets,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	352	Indirect chain[4], int *err)
				353	{
				354	struct super_block *sb = inode->i_sb;
				355	Indirect *p = chain;
				356	struct buffer_head *bh;
				357
				358	*err = 0;
				359	/* i_data is not going away, no lock needed */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	360	add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	361	if (!p->key)
				362	goto no_block;
				363	while (--depth) {
				364	bh = sb_bread(sb, le32_to_cpu(p->key));
				365	if (!bh)
				366	goto failure;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	367	add_chain(++p, bh, (__le32)bh->b_data + ++offsets);
				368	/* Reader: end */
				369	if (!p->key)
				370	goto no_block;
				371	}
				372	return NULL;
				373
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	374	failure:
				375	*err = -EIO;
				376	no_block:
				377	return p;
				378	}
				379
				380	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	381	* ext4_find_near - find a place for allocation with sufficient locality
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	382	* @inode: owner
				383	* @ind: descriptor of indirect block.
				384	*
Benoit Boissinot	1cc8dcf	2008-04-21 22:45:55 +0000	[diff] [blame]	385	* This function returns the preferred place for block allocation.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	386	* It is used when heuristic for sequential allocation fails.
				387	* Rules are:
				388	* + if there is a block to the left of our position - allocate near it.
				389	* + if pointer will live in indirect block - allocate near that block.
				390	* + if pointer will live in inode - allocate in the same
				391	* cylinder group.
				392	*
				393	* In the latter case we colour the starting block by the callers PID to
				394	* prevent it from clashing with concurrent allocations for a different inode
				395	* in the same block group. The PID is used here so that functionally related
				396	* files will be close-by on-disk.
				397	*
				398	* Caller must make sure that @ind is valid and will stay that way.
				399	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	400	static ext4_fsblk_t ext4_find_near(struct inode inode, Indirect ind)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	401	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	402	struct ext4_inode_info *ei = EXT4_I(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	403	__le32 start = ind->bh ? (__le32) ind->bh->b_data : ei->i_data;
				404	__le32 *p;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	405	ext4_fsblk_t bg_start;
Valerie Clement	74d3487	2008-02-15 13:43:07 -0500	[diff] [blame]	406	ext4_fsblk_t last_block;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	407	ext4_grpblk_t colour;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	408
				409	/* Try to find previous block */
				410	for (p = ind->p - 1; p >= start; p--) {
				411	if (*p)
				412	return le32_to_cpu(*p);
				413	}
				414
				415	/* No such thing, so let's try location of indirect block */
				416	if (ind->bh)
				417	return ind->bh->b_blocknr;
				418
				419	/*
				420	* It is going to be referred to from the inode itself? OK, just put it
				421	* into the same cylinder group then.
				422	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	423	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
Valerie Clement	74d3487	2008-02-15 13:43:07 -0500	[diff] [blame]	424	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
				425
				426	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
				427	colour = (current->pid % 16) *
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	428	(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
Valerie Clement	74d3487	2008-02-15 13:43:07 -0500	[diff] [blame]	429	else
				430	colour = (current->pid % 16) * ((last_block - bg_start) / 16);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	431	return bg_start + colour;
				432	}
				433
				434	/**
Benoit Boissinot	1cc8dcf	2008-04-21 22:45:55 +0000	[diff] [blame]	435	* ext4_find_goal - find a preferred place for allocation.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	436	* @inode: owner
				437	* @block: block we want
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	438	* @partial: pointer to the last triple within a chain
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	439	*
Benoit Boissinot	1cc8dcf	2008-04-21 22:45:55 +0000	[diff] [blame]	440	* Normally this function find the preferred place for block allocation,
Akinobu Mita	fb01bfd	2008-02-06 01:40:16 -0800	[diff] [blame]	441	* returns it.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	442	*/
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	443	static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Akinobu Mita	fb01bfd	2008-02-06 01:40:16 -0800	[diff] [blame]	444	Indirect *partial)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	445	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	446	struct ext4_block_alloc_info *block_i;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	447
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	448	block_i = EXT4_I(inode)->i_block_alloc_info;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	449
				450	/*
				451	* try the heuristic for sequential allocation,
				452	* failing that at least try to get decent locality.
				453	*/
				454	if (block_i && (block == block_i->last_alloc_logical_block + 1)
				455	&& (block_i->last_alloc_physical_block != 0)) {
				456	return block_i->last_alloc_physical_block + 1;
				457	}
				458
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	459	return ext4_find_near(inode, partial);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	460	}
				461
				462	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	463	* ext4_blks_to_allocate: Look up the block map and count the number
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	464	* of direct blocks need to be allocated for the given branch.
				465	*
				466	* @branch: chain of indirect blocks
				467	* @k: number of blocks need for indirect blocks
				468	* @blks: number of data blocks to be mapped.
				469	* @blocks_to_boundary: the offset in the indirect block
				470	*
				471	* return the total number of blocks to be allocate, including the
				472	* direct and indirect blocks.
				473	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	474	static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	475	int blocks_to_boundary)
				476	{
				477	unsigned long count = 0;
				478
				479	/*
				480	* Simple case, [t,d]Indirect block(s) has not allocated yet
				481	* then it's clear blocks on that path have not allocated
				482	*/
				483	if (k > 0) {
				484	/* right now we don't handle cross boundary allocation */
				485	if (blks < blocks_to_boundary + 1)
				486	count += blks;
				487	else
				488	count += blocks_to_boundary + 1;
				489	return count;
				490	}
				491
				492	count++;
				493	while (count < blks && count <= blocks_to_boundary &&
				494	le32_to_cpu(*(branch[0].p + count)) == 0) {
				495	count++;
				496	}
				497	return count;
				498	}
				499
				500	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	501	* ext4_alloc_blocks: multiple allocate blocks needed for a branch
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	502	* @indirect_blks: the number of blocks need to allocate for indirect
				503	* blocks
				504	*
				505	* @new_blocks: on return it will store the new block numbers for
				506	* the indirect blocks(if needed) and the first direct block,
				507	* @blks: on return it will store the total number of allocated
				508	* direct blocks
				509	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	510	static int ext4_alloc_blocks(handle_t handle, struct inode inode,
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	511	ext4_lblk_t iblock, ext4_fsblk_t goal,
				512	int indirect_blks, int blks,
				513	ext4_fsblk_t new_blocks[4], int *err)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	514	{
				515	int target, i;
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	516	unsigned long count = 0, blk_allocated = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	517	int index = 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	518	ext4_fsblk_t current_block = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	519	int ret = 0;
				520
				521	/*
				522	* Here we try to allocate the requested multiple blocks at once,
				523	* on a best-effort basis.
				524	* To build a branch, we should allocate blocks for
				525	* the indirect blocks(if not allocated yet), and at least
				526	* the first direct block of this branch. That's the
				527	* minimum number of blocks need to allocate(required)
				528	*/
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	529	/* first we try to allocate the indirect blocks */
				530	target = indirect_blks;
				531	while (target > 0) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	532	count = target;
				533	/* allocating blocks for indirect blocks and direct blocks */
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	534	current_block = ext4_new_meta_blocks(handle, inode,
				535	goal, &count, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	536	if (*err)
				537	goto failed_out;
				538
				539	target -= count;
				540	/* allocate blocks for indirect blocks */
				541	while (index < indirect_blks && count) {
				542	new_blocks[index++] = current_block++;
				543	count--;
				544	}
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	545	if (count > 0) {
				546	/*
				547	* save the new block number
				548	* for the first direct block
				549	*/
				550	new_blocks[index] = current_block;
				551	printk(KERN_INFO "%s returned more blocks than "
				552	"requested\n", __func__);
				553	WARN_ON(1);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	554	break;
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	555	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	556	}
				557
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	558	target = blks - count ;
				559	blk_allocated = count;
				560	if (!target)
				561	goto allocated;
				562	/* Now allocate data blocks */
				563	count = target;
Aneesh Kumar K.V	654b490	2008-07-11 19:27:31 -0400	[diff] [blame]	564	/* allocating blocks for data blocks */
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	565	current_block = ext4_new_blocks(handle, inode, iblock,
				566	goal, &count, err);
				567	if (*err && (target == blks)) {
				568	/*
				569	* if the allocation failed and we didn't allocate
				570	* any blocks before
				571	*/
				572	goto failed_out;
				573	}
				574	if (!*err) {
				575	if (target == blks) {
				576	/*
				577	* save the new block number
				578	* for the first direct block
				579	*/
				580	new_blocks[index] = current_block;
				581	}
				582	blk_allocated += count;
				583	}
				584	allocated:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	585	/* total number of blocks allocated for direct blocks */
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	586	ret = blk_allocated;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	587	*err = 0;
				588	return ret;
				589	failed_out:
				590	for (i = 0; i <index; i++)
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	591	ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	592	return ret;
				593	}
				594
				595	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	596	* ext4_alloc_branch - allocate and set up a chain of blocks.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	597	* @inode: owner
				598	* @indirect_blks: number of allocated indirect blocks
				599	* @blks: number of allocated direct blocks
				600	* @offsets: offsets (in the blocks) to store the pointers to next.
				601	* @branch: place to store the chain in.
				602	*
				603	* This function allocates blocks, zeroes out all but the last one,
				604	* links them into chain and (if we are synchronous) writes them to disk.
				605	* In other words, it prepares a branch that can be spliced onto the
				606	* inode. It stores the information about that chain in the branch[], in
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	607	* the same format as ext4_get_branch() would do. We are calling it after
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	608	* we had read the existing part of chain and partial points to the last
				609	* triple of that (one with zero ->key). Upon the exit we have the same
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	610	* picture as after the successful ext4_get_block(), except that in one
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	611	* place chain is disconnected - *branch->p is still zero (we did not
				612	* set the last link), but branch->key contains the number that should
				613	* be placed into *branch->p to fill that gap.
				614	*
				615	* If allocation fails we free all blocks we've allocated (and forget
				616	* their buffer_heads) and return the error value from the failed
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	617	* ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	618	* as described above and return 0.
				619	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	620	static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	621	ext4_lblk_t iblock, int indirect_blks,
				622	int *blks, ext4_fsblk_t goal,
				623	ext4_lblk_t *offsets, Indirect *branch)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	624	{
				625	int blocksize = inode->i_sb->s_blocksize;
				626	int i, n = 0;
				627	int err = 0;
				628	struct buffer_head *bh;
				629	int num;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	630	ext4_fsblk_t new_blocks[4];
				631	ext4_fsblk_t current_block;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	632
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	633	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	634	*blks, new_blocks, &err);
				635	if (err)
				636	return err;
				637
				638	branch[0].key = cpu_to_le32(new_blocks[0]);
				639	/*
				640	* metadata blocks and data blocks are allocated.
				641	*/
				642	for (n = 1; n <= indirect_blks; n++) {
				643	/*
				644	* Get buffer_head for parent block, zero it out
				645	* and set the pointer to new one, then send
				646	* parent to disk.
				647	*/
				648	bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
				649	branch[n].bh = bh;
				650	lock_buffer(bh);
				651	BUFFER_TRACE(bh, "call get_create_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	652	err = ext4_journal_get_create_access(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	653	if (err) {
				654	unlock_buffer(bh);
				655	brelse(bh);
				656	goto failed;
				657	}
				658
				659	memset(bh->b_data, 0, blocksize);
				660	branch[n].p = (__le32 *) bh->b_data + offsets[n];
				661	branch[n].key = cpu_to_le32(new_blocks[n]);
				662	*branch[n].p = branch[n].key;
				663	if ( n == indirect_blks) {
				664	current_block = new_blocks[n];
				665	/*
				666	* End of chain, update the last new metablock of
				667	* the chain to point to the new allocated
				668	* data blocks numbers
				669	*/
				670	for (i=1; i < num; i++)
				671	*(branch[n].p + i) = cpu_to_le32(++current_block);
				672	}
				673	BUFFER_TRACE(bh, "marking uptodate");
				674	set_buffer_uptodate(bh);
				675	unlock_buffer(bh);
				676
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	677	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
				678	err = ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	679	if (err)
				680	goto failed;
				681	}
				682	*blks = num;
				683	return err;
				684	failed:
				685	/* Allocation failed, free what we already allocated */
				686	for (i = 1; i <= n ; i++) {
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	687	BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	688	ext4_journal_forget(handle, branch[i].bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	689	}
				690	for (i = 0; i <indirect_blks; i++)
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	691	ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	692
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	693	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	694
				695	return err;
				696	}
				697
				698	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	699	* ext4_splice_branch - splice the allocated branch onto inode.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	700	* @inode: owner
				701	* @block: (logical) number of block we are adding
				702	* @chain: chain of indirect blocks (with a missing link - see
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	703	* ext4_alloc_branch)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	704	* @where: location of missing link
				705	* @num: number of indirect blocks we are adding
				706	* @blks: number of direct blocks we are adding
				707	*
				708	* This function fills the missing link and does all housekeeping needed in
				709	* inode (->i_blocks, etc.). In case of success we end up with the full
				710	* chain to new block and return 0.
				711	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	712	static int ext4_splice_branch(handle_t *handle, struct inode *inode,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	713	ext4_lblk_t block, Indirect *where, int num, int blks)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	714	{
				715	int i;
				716	int err = 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	717	struct ext4_block_alloc_info *block_i;
				718	ext4_fsblk_t current_block;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	719
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	720	block_i = EXT4_I(inode)->i_block_alloc_info;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	721	/*
				722	* If we're splicing into a [td]indirect block (as opposed to the
				723	* inode) then we need to get write access to the [td]indirect block
				724	* before the splice.
				725	*/
				726	if (where->bh) {
				727	BUFFER_TRACE(where->bh, "get_write_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	728	err = ext4_journal_get_write_access(handle, where->bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	729	if (err)
				730	goto err_out;
				731	}
				732	/* That's it */
				733
				734	*where->p = where->key;
				735
				736	/*
				737	* Update the host buffer_head or inode to point to the additional
				738	* just-allocated direct blocks
				739	*/
				740	if (num == 0 && blks > 1) {
				741	current_block = le32_to_cpu(where->key) + 1;
				742	for (i = 1; i < blks; i++)
				743	*(where->p + i ) = cpu_to_le32(current_block++);
				744	}
				745
				746	/*
				747	* update the most recently allocated logical & physical block
				748	* in i_block_alloc_info, to help find the proper goal block for the next
				749	* allocation
				750	*/
				751	if (block_i) {
				752	block_i->last_alloc_logical_block = block + blks - 1;
				753	block_i->last_alloc_physical_block =
				754	le32_to_cpu(where[num].key) + blks - 1;
				755	}
				756
				757	/* We are done with atomic stuff, now do the rest of housekeeping */
				758
Kalpak Shah	ef7f383	2007-07-18 09:15:20 -0400	[diff] [blame]	759	inode->i_ctime = ext4_current_time(inode);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	760	ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	761
				762	/* had we spliced it onto indirect block? */
				763	if (where->bh) {
				764	/*
				765	* If we spliced it onto an indirect block, we haven't
				766	* altered the inode. Note however that if it is being spliced
				767	* onto an indirect block at the very end of the file (the
				768	* file is growing) then we will alter the inode to reflect
				769	* the new i_size. But that is not done here - it is done in
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	770	* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	771	*/
				772	jbd_debug(5, "splicing indirect only\n");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	773	BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
				774	err = ext4_journal_dirty_metadata(handle, where->bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	775	if (err)
				776	goto err_out;
				777	} else {
				778	/*
				779	* OK, we spliced it into the inode itself on a direct block.
				780	* Inode was dirtied above.
				781	*/
				782	jbd_debug(5, "splicing direct\n");
				783	}
				784	return err;
				785
				786	err_out:
				787	for (i = 1; i <= num; i++) {
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	788	BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	789	ext4_journal_forget(handle, where[i].bh);
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	790	ext4_free_blocks(handle, inode,
				791	le32_to_cpu(where[i-1].key), 1, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	792	}
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	793	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	794
				795	return err;
				796	}
				797
				798	/*
				799	* Allocation strategy is simple: if we have to allocate something, we will
				800	* have to go the whole way to leaf. So let's do it before attaching anything
				801	* to tree, set linkage between the newborn blocks, write them if sync is
				802	* required, recheck the path, free and repeat if check fails, otherwise
				803	* set the last missing link (that will protect us from any truncate-generated
				804	* removals - all blocks on the path are immune now) and possibly force the
				805	* write on the parent block.
				806	* That has a nice additional property: no special recovery from the failed
				807	* allocations is needed - we simply release blocks and do not touch anything
				808	* reachable from inode.
				809	*
				810	* `handle' can be NULL if create == 0.
				811	*
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	812	* return > 0, # of blocks mapped or allocated.
				813	* return = 0, if plain lookup failed.
				814	* return < 0, error case.
Aneesh Kumar K.V	c278bfe	2008-01-28 23:58:27 -0500	[diff] [blame]	815	*
				816	*
				817	* Need to be called with
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	818	* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
				819	* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	820	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	821	int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	822	ext4_lblk_t iblock, unsigned long maxblocks,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	823	struct buffer_head *bh_result,
				824	int create, int extend_disksize)
				825	{
				826	int err = -EIO;
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	827	ext4_lblk_t offsets[4];
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	828	Indirect chain[4];
				829	Indirect *partial;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	830	ext4_fsblk_t goal;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	831	int indirect_blks;
				832	int blocks_to_boundary = 0;
				833	int depth;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	834	struct ext4_inode_info *ei = EXT4_I(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	835	int count = 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	836	ext4_fsblk_t first_block = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	837
				838
Alex Tomas	a86c618	2006-10-11 01:21:03 -0700	[diff] [blame]	839	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	840	J_ASSERT(handle != NULL || create == 0);
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	841	depth = ext4_block_to_path(inode, iblock, offsets,
				842	&blocks_to_boundary);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	843
				844	if (depth == 0)
				845	goto out;
				846
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	847	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	848
				849	/* Simplest case - block found, no allocation needed */
				850	if (!partial) {
				851	first_block = le32_to_cpu(chain[depth - 1].key);
				852	clear_buffer_new(bh_result);
				853	count++;
				854	/* map more blocks */
				855	while (count < maxblocks && count <= blocks_to_boundary) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	856	ext4_fsblk_t blk;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	857
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	858	blk = le32_to_cpu(*(chain[depth-1].p + count));
				859
				860	if (blk == first_block + count)
				861	count++;
				862	else
				863	break;
				864	}
Aneesh Kumar K.V	c278bfe	2008-01-28 23:58:27 -0500	[diff] [blame]	865	goto got_it;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	866	}
				867
				868	/* Next simple case - plain lookup or failed read of indirect block */
				869	if (!create || err == -EIO)
				870	goto cleanup;
				871
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	872	/*
				873	* Okay, we need to do block allocation. Lazily initialize the block
				874	* allocation info here if necessary
				875	*/
				876	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	877	ext4_init_block_alloc_info(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	878
Akinobu Mita	fb01bfd	2008-02-06 01:40:16 -0800	[diff] [blame]	879	goal = ext4_find_goal(inode, iblock, partial);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	880
				881	/* the number of blocks we need to allocate for [d,t]indirect blocks */
				882	indirect_blks = (chain + depth) - partial - 1;
				883
				884	/*
				885	* Next look up the indirect map to count the total number of
				886	* direct blocks to allocate for this branch.
				887	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	888	count = ext4_blks_to_allocate(partial, indirect_blks,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	889	maxblocks, blocks_to_boundary);
				890	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	891	* Block out ext4_truncate while we alter the tree
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	892	*/
Aneesh Kumar K.V	7061eba	2008-07-11 19:27:31 -0400	[diff] [blame]	893	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
				894	&count, goal,
				895	offsets + (partial - chain), partial);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	896
				897	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	898	* The ext4_splice_branch call will free and forget any buffers
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	899	* on the new chain if there is a failure, but that risks using
				900	* up transaction credits, especially for bitmaps where the
				901	* credits cannot be returned. Can we handle this somehow? We
				902	* may need to return -EAGAIN upwards in the worst case. --sct
				903	*/
				904	if (!err)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	905	err = ext4_splice_branch(handle, inode, iblock,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	906	partial, indirect_blks, count);
				907	/*
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	908	* i_disksize growing is protected by i_data_sem. Don't forget to
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	909	* protect it if you're about to implement concurrent
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	910	* ext4_get_block() -bzzz
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	911	*/
				912	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
				913	ei->i_disksize = inode->i_size;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	914	if (err)
				915	goto cleanup;
				916
				917	set_buffer_new(bh_result);
				918	got_it:
				919	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
				920	if (count > blocks_to_boundary)
				921	set_buffer_boundary(bh_result);
				922	err = count;
				923	/* Clean up and exit */
				924	partial = chain + depth - 1; /* the whole chain */
				925	cleanup:
				926	while (partial > chain) {
				927	BUFFER_TRACE(partial->bh, "call brelse");
				928	brelse(partial->bh);
				929	partial--;
				930	}
				931	BUFFER_TRACE(bh_result, "returned");
				932	out:
				933	return err;
				934	}
				935
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	936	/* Maximum number of blocks we map for direct IO at once. */
				937	#define DIO_MAX_BLOCKS 4096
				938	/*
				939	* Number of credits we need for writing DIO_MAX_BLOCKS:
				940	* We need sb + group descriptor + bitmap + inode -> 4
				941	* For B blocks with A block pointers per block we need:
				942	* 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
				943	* If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
				944	*/
				945	#define DIO_CREDITS 25
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	946
Mingming Cao	f5ab0d1	2008-02-25 15:29:55 -0500	[diff] [blame]	947
				948	/*
				949	*
				950	*
				951	* ext4 get_block() wrapper function
				952	* It will do a look up first, and returns if the blocks already mapped.
				953	* Otherwise it takes the write lock of the i_data_sem and allocate blocks
				954	* and store the allocated blocks in the result buffer head and mark it
				955	* mapped.
				956	*
				957	* If file type is extents based, it will call ext4_ext_get_blocks(),
				958	* Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
				959	* based files
				960	*
				961	* On success, it returns the number of blocks mapped or allocated.
				962	* If create == 0 and the blocks are pre-allocated but uninitialized,
				963	* the result buffer head is unmapped. If create == 1, it will make sure
				964	* the buffer head is mapped.
				965	*
				966	* It returns 0 if plain look up failed (blocks have not been allocated); in
				967	* that case, the buffer head is unmapped.
				968	*
				969	* It returns the error in case of allocation failure.
				970	*/
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	971	int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
				972	unsigned long max_blocks, struct buffer_head *bh,
				973	int create, int extend_disksize)
				974	{
				975	int retval;
Mingming Cao	f5ab0d1	2008-02-25 15:29:55 -0500	[diff] [blame]	976
				977	clear_buffer_mapped(bh);
				978
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	979	/*
				980	* Try to see if we can get the block without requesting
				981	* for new file system block.
				982	*/
				983	down_read((&EXT4_I(inode)->i_data_sem));
				984	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
				985	retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
				986	bh, 0, 0);
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	987	} else {
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	988	retval = ext4_get_blocks_handle(handle,
				989	inode, block, max_blocks, bh, 0, 0);
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	990	}
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	991	up_read((&EXT4_I(inode)->i_data_sem));
Mingming Cao	f5ab0d1	2008-02-25 15:29:55 -0500	[diff] [blame]	992
				993	/* If it is only a block(s) look up */
				994	if (!create)
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	995	return retval;
				996
				997	/*
Mingming Cao	f5ab0d1	2008-02-25 15:29:55 -0500	[diff] [blame]	998	* Return if the blocks have already been allocated.
				999	*
				1000	* Note that if blocks have been preallocated,
				1001	* ext4_ext_get_blocks() called with create = 0 returns
				1002	* with the buffer head unmapped.
				1003	*/
				1004	if (retval > 0 && buffer_mapped(bh))
				1005	return retval;
				1006
				1007	/*
				1008	* Allocating new blocks and/or writing to an uninitialized extent
				1009	* will possibly result in updating i_data, so we take
				1010	* the write lock of i_data_sem, and call get_blocks()
				1011	* with create == 1 flag.
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	1012	*/
				1013	down_write((&EXT4_I(inode)->i_data_sem));
				1014	/*
				1015	* We need to check for EXT4_EXTENTS_FL here because migrate
				1016	* could have changed the inode type in between
				1017	*/
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	1018	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
				1019	retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
				1020	bh, create, extend_disksize);
				1021	} else {
				1022	retval = ext4_get_blocks_handle(handle, inode, block,
				1023	max_blocks, bh, create, extend_disksize);
Aneesh Kumar K.V	267e4db	2008-04-29 08:11:12 -0400	[diff] [blame]	1024
				1025	if (retval > 0 && buffer_new(bh)) {
				1026	/*
				1027	* We allocated new blocks which will result in
				1028	* i_data's format changing. Force the migrate
				1029	* to fail by clearing migrate flags
				1030	*/
				1031	EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
				1032	~EXT4_EXT_MIGRATE;
				1033	}
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	1034	}
Aneesh Kumar K.V	4df3d26	2008-01-28 23:58:29 -0500	[diff] [blame]	1035	up_write((&EXT4_I(inode)->i_data_sem));
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	1036	return retval;
				1037	}
				1038
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1039	static int ext4_get_block(struct inode *inode, sector_t iblock,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1040	struct buffer_head *bh_result, int create)
				1041	{
Dmitriy Monakhov	3e4fdaf	2007-02-10 01:46:35 -0800	[diff] [blame]	1042	handle_t *handle = ext4_journal_current_handle();
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1043	int ret = 0, started = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1044	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
				1045
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1046	if (create && !handle) {
				1047	/* Direct IO write... */
				1048	if (max_blocks > DIO_MAX_BLOCKS)
				1049	max_blocks = DIO_MAX_BLOCKS;
				1050	handle = ext4_journal_start(inode, DIO_CREDITS +
				1051	2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
				1052	if (IS_ERR(handle)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1053	ret = PTR_ERR(handle);
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1054	goto out;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1055	}
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1056	started = 1;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1057	}
				1058
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1059	ret = ext4_get_blocks_wrap(handle, inode, iblock,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1060	max_blocks, bh_result, create, 0);
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1061	if (ret > 0) {
				1062	bh_result->b_size = (ret << inode->i_blkbits);
				1063	ret = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1064	}
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1065	if (started)
				1066	ext4_journal_stop(handle);
				1067	out:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1068	return ret;
				1069	}
				1070
				1071	/*
				1072	* `handle' can be NULL if create is zero
				1073	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1074	struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	1075	ext4_lblk_t block, int create, int *errp)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1076	{
				1077	struct buffer_head dummy;
				1078	int fatal = 0, err;
				1079
				1080	J_ASSERT(handle != NULL || create == 0);
				1081
				1082	dummy.b_state = 0;
				1083	dummy.b_blocknr = -1000;
				1084	buffer_trace_init(&dummy.b_history);
Alex Tomas	a86c618	2006-10-11 01:21:03 -0700	[diff] [blame]	1085	err = ext4_get_blocks_wrap(handle, inode, block, 1,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1086	&dummy, create, 1);
				1087	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1088	* ext4_get_blocks_handle() returns number of blocks
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1089	* mapped. 0 in case of a HOLE.
				1090	*/
				1091	if (err > 0) {
				1092	if (err > 1)
				1093	WARN_ON(1);
				1094	err = 0;
				1095	}
				1096	*errp = err;
				1097	if (!err && buffer_mapped(&dummy)) {
				1098	struct buffer_head *bh;
				1099	bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
				1100	if (!bh) {
				1101	*errp = -EIO;
				1102	goto err;
				1103	}
				1104	if (buffer_new(&dummy)) {
				1105	J_ASSERT(create != 0);
Aneesh Kumar K.V	ac39849	2007-10-16 18:38:25 -0400	[diff] [blame]	1106	J_ASSERT(handle != NULL);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1107
				1108	/*
				1109	* Now that we do not always journal data, we should
				1110	* keep in mind whether this should always journal the
				1111	* new buffer as metadata. For now, regular file
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1112	* writes use ext4_get_block instead, so it's not a
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1113	* problem.
				1114	*/
				1115	lock_buffer(bh);
				1116	BUFFER_TRACE(bh, "call get_create_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1117	fatal = ext4_journal_get_create_access(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1118	if (!fatal && !buffer_uptodate(bh)) {
				1119	memset(bh->b_data,0,inode->i_sb->s_blocksize);
				1120	set_buffer_uptodate(bh);
				1121	}
				1122	unlock_buffer(bh);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1123	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
				1124	err = ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1125	if (!fatal)
				1126	fatal = err;
				1127	} else {
				1128	BUFFER_TRACE(bh, "not a new buffer");
				1129	}
				1130	if (fatal) {
				1131	*errp = fatal;
				1132	brelse(bh);
				1133	bh = NULL;
				1134	}
				1135	return bh;
				1136	}
				1137	err:
				1138	return NULL;
				1139	}
				1140
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1141	struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	1142	ext4_lblk_t block, int create, int *err)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1143	{
				1144	struct buffer_head * bh;
				1145
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1146	bh = ext4_getblk(handle, inode, block, create, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1147	if (!bh)
				1148	return bh;
				1149	if (buffer_uptodate(bh))
				1150	return bh;
				1151	ll_rw_block(READ_META, 1, &bh);
				1152	wait_on_buffer(bh);
				1153	if (buffer_uptodate(bh))
				1154	return bh;
				1155	put_bh(bh);
				1156	*err = -EIO;
				1157	return NULL;
				1158	}
				1159
				1160	static int walk_page_buffers( handle_t *handle,
				1161	struct buffer_head *head,
				1162	unsigned from,
				1163	unsigned to,
				1164	int *partial,
				1165	int (*fn)( handle_t *handle,
				1166	struct buffer_head *bh))
				1167	{
				1168	struct buffer_head *bh;
				1169	unsigned block_start, block_end;
				1170	unsigned blocksize = head->b_size;
				1171	int err, ret = 0;
				1172	struct buffer_head *next;
				1173
				1174	for ( bh = head, block_start = 0;
				1175	ret == 0 && (bh != head || !block_start);
				1176	block_start = block_end, bh = next)
				1177	{
				1178	next = bh->b_this_page;
				1179	block_end = block_start + blocksize;
				1180	if (block_end <= from || block_start >= to) {
				1181	if (partial && !buffer_uptodate(bh))
				1182	*partial = 1;
				1183	continue;
				1184	}
				1185	err = (*fn)(handle, bh);
				1186	if (!ret)
				1187	ret = err;
				1188	}
				1189	return ret;
				1190	}
				1191
				1192	/*
				1193	* To preserve ordering, it is essential that the hole instantiation and
				1194	* the data write be encapsulated in a single transaction. We cannot
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1195	* close off a transaction and start a new one between the ext4_get_block()
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1196	* and the commit_write(). So doing the jbd2_journal_start at the start of
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1197	* prepare_write() is the right place.
				1198	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1199	* Also, this function can nest inside ext4_writepage() ->
				1200	* block_write_full_page(). In that case, we know that ext4_writepage()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1201	* has generated enough buffer credits to do the whole page. So we won't
				1202	* block on the journal in that case, which is good, because the caller may
				1203	* be PF_MEMALLOC.
				1204	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1205	* By accident, ext4 can be reentered when a transaction is open via
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1206	* quota file writes. If we were to commit the transaction while thus
				1207	* reentered, there can be a deadlock - we would be holding a quota
				1208	* lock, and the commit would never complete if another thread had a
				1209	* transaction open and was blocking on the quota lock - a ranking
				1210	* violation.
				1211	*
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1212	* So what we do is to rely on the fact that jbd2_journal_stop/journal_start
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1213	* will _not_ run commit under these circumstances because handle->h_ref
				1214	* is elevated. We'll still have enough credits for the tiny quotafile
				1215	* write.
				1216	*/
				1217	static int do_journal_get_write_access(handle_t *handle,
				1218	struct buffer_head *bh)
				1219	{
					/*
					* Per-buffer callback passed to walk_page_buffers(): request
					* journal write access for @bh under @handle.
					*
					* Returns 0 without calling into the journal when the buffer
					* is not mapped or is marked freed; otherwise returns the
					* result of ext4_journal_get_write_access().
					*/
				1220	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				1221	return 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1222	return ext4_journal_get_write_access(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1223	}
				1224
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1225	static int ext4_write_begin(struct file file, struct address_space mapping,
				1226	loff_t pos, unsigned len, unsigned flags,
				1227	struct page pagep, void fsdata)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1228	{
					/*
					* Begin a buffered write of @len bytes at file offset @pos.
					*
					* Opens a journal handle sized by ext4_writepage_trans_blocks(),
					* grabs the page-cache page covering @pos, stores it through
					* @pagep and calls block_write_begin() with ext4_get_block.
					* When ext4_should_journal_data() is true, journal write access
					* is also requested for every buffer in [from, to).
					*
					* Returns the result of block_write_begin() or of the buffer
					* walk (0 when neither reports an error), PTR_ERR() of a failed
					* handle, or -ENOMEM when no page could be grabbed. On a
					* non-zero result the page is unlocked and released and the
					* handle is stopped. When ret is 0 the handle is not stopped
					* in this function.
					*
					* NOTE(review): the parameter list as rendered here has no
					* pointer declarators, yet the body uses mapping->host and
					* assigns through *pagep; the declarators look to have been
					* lost in extraction - confirm against the real source.
					*/
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1229	struct inode *inode = mapping->host;
Andrew Morton	7479d2b	2007-04-01 23:49:44 -0700	[diff] [blame]	1230	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1231	handle_t *handle;
				1232	int retries = 0;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1233	struct page *page;
				1234	pgoff_t index;
				1235	unsigned from, to;
				1236
					/* index: page number holding pos; [from, to): byte span inside that page */
				1237	index = pos >> PAGE_CACHE_SHIFT;
				1238	from = pos & (PAGE_CACHE_SIZE - 1);
				1239	to = from + len;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1240
				1241	retry:
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1242	handle = ext4_journal_start(inode, needed_blocks);
				1243	if (IS_ERR(handle)) {
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1244	ret = PTR_ERR(handle);
				1245	goto out;
				1246	}
				1247
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1248	page = __grab_cache_page(mapping, index);
				1249	if (!page) {
				1250	ext4_journal_stop(handle);
				1251	ret = -ENOMEM;
				1252	goto out;
				1253	}
				1254	*pagep = page;
				1255
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1256	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				1257	ext4_get_block);
				1258
					/* Only the data-journalling mode needs write access on the data buffers. */
				1259	if (!ret && ext4_should_journal_data(inode)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1260	ret = walk_page_buffers(handle, page_buffers(page),
				1261	from, to, NULL, do_journal_get_write_access);
				1262	}
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1263
					/* Failure: undo the page grab and close the handle before returning or retrying. */
				1264	if (ret) {
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1265	unlock_page(page);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1266	ext4_journal_stop(handle);
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1267	page_cache_release(page);
				1268	}
				1269
					/* Out of space: start over from a fresh handle for as long as the retry helper agrees. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1270	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1271	goto retry;
Andrew Morton	7479d2b	2007-04-01 23:49:44 -0700	[diff] [blame]	1272	out:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1273	return ret;
				1274	}
				1275
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1276	int ext4_journal_dirty_data(handle_t handle, struct buffer_head bh)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1277	{
					/*
					* Wrapper around jbd2_journal_dirty_data(): passes @bh to it
					* under @handle and, if that reports an error, calls
					* ext4_journal_abort_handle() with this function name as
					* both name arguments. Returns the jbd2 result unchanged.
					*/
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1278	int err = jbd2_journal_dirty_data(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1279	if (err)
Harvey Harrison	46e665e	2008-04-17 10:38:59 -0400	[diff] [blame]	1280	ext4_journal_abort_handle(__func__, __func__,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1281	bh, handle, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1282	return err;
				1283	}
				1284
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1285	/* For write_end() in data=journal mode */
				1286	static int write_end_fn(handle_t handle, struct buffer_head bh)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1287	{
					/*
					* Per-buffer callback for walk_page_buffers(). Buffers that
					* are unmapped or marked freed are skipped with a 0 return.
					* Any other buffer is marked uptodate and handed to
					* ext4_journal_dirty_metadata(), whose result is returned.
					*/
				1288	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				1289	return 0;
				1290	set_buffer_uptodate(bh);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1291	return ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1292	}
				1293
				1294	/*
				1295	* We need to pick up the new inode size which generic_commit_write gave us
				1296	* `file' can be NULL - eg, when called from page_symlink().
				1297	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1298	* ext4 never places buffers on inode->i_mapping->private_list. metadata
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1299	* buffers are managed internally.
				1300	*/
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1301	static int ext4_ordered_write_end(struct file *file,
				1302	struct address_space *mapping,
				1303	loff_t pos, unsigned len, unsigned copied,
				1304	struct page page, void fsdata)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1305	{
					/*
					* Finish a buffered write on the handle returned by
					* ext4_journal_current_handle().
					*
					* Every buffer in [from, to) of @page is passed to
					* ext4_journal_dirty_data(). If that walk succeeds,
					* i_disksize is raised to pos + copied when that is larger,
					* and generic_write_end() completes the write. The handle
					* is stopped in all cases.
					*
					* Returns the first non-zero error seen (buffer walk,
					* generic_write_end() or ext4_journal_stop()); otherwise
					* the value generic_write_end() returned.
					*
					* NOTE(review): when the buffer walk fails nothing in this
					* function unlocks or releases @page - confirm whether the
					* caller is expected to do so.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1306	handle_t *handle = ext4_journal_current_handle();
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1307	struct inode *inode = mapping->host;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1308	unsigned from, to;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1309	int ret = 0, ret2;
				1310
					/* [from, to): byte span of this write inside the page */
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1311	from = pos & (PAGE_CACHE_SIZE - 1);
				1312	to = from + len;
				1313
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1314	ret = walk_page_buffers(handle, page_buffers(page),
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1315	from, to, NULL, ext4_journal_dirty_data);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1316
				1317	if (ret == 0) {
				1318	/*
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1319	* generic_write_end() will run mark_inode_dirty() if i_size
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1320	* changes. So let's piggyback the i_disksize mark_inode_dirty
				1321	* into that.
				1322	*/
				1323	loff_t new_i_size;
				1324
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1325	new_i_size = pos + copied;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1326	if (new_i_size > EXT4_I(inode)->i_disksize)
				1327	EXT4_I(inode)->i_disksize = new_i_size;
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1328	ret2 = generic_write_end(file, mapping, pos, len, copied,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1329	page, fsdata);
					/* copied now carries the generic_write_end() result; a negative one is kept in ret */
Roel Kluin	f8a87d8	2008-04-29 22:01:18 -0400	[diff] [blame]	1330	copied = ret2;
				1331	if (ret2 < 0)
				1332	ret = ret2;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1333	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1334	ret2 = ext4_journal_stop(handle);
					/* An earlier error takes precedence over the journal-stop result. */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1335	if (!ret)
				1336	ret = ret2;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1337
				1338	return ret ? ret : copied;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1339	}
				1340
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1341	static int ext4_writeback_write_end(struct file *file,
				1342	struct address_space *mapping,
				1343	loff_t pos, unsigned len, unsigned copied,
				1344	struct page page, void fsdata)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1345	{
					/*
					* Finish a buffered write on the handle returned by
					* ext4_journal_current_handle(), without any per-buffer
					* journalling step.
					*
					* Raises i_disksize to pos + copied when that is larger,
					* calls generic_write_end(), then stops the handle.
					*
					* Returns the first non-zero error seen (generic_write_end()
					* or ext4_journal_stop()); otherwise the value
					* generic_write_end() returned.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1346	handle_t *handle = ext4_journal_current_handle();
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1347	struct inode *inode = mapping->host;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1348	int ret = 0, ret2;
				1349	loff_t new_i_size;
				1350
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1351	new_i_size = pos + copied;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1352	if (new_i_size > EXT4_I(inode)->i_disksize)
				1353	EXT4_I(inode)->i_disksize = new_i_size;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1354
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1355	ret2 = generic_write_end(file, mapping, pos, len, copied,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1356	page, fsdata);
					/* copied now carries the generic_write_end() result; a negative one is kept in ret */
Roel Kluin	f8a87d8	2008-04-29 22:01:18 -0400	[diff] [blame]	1357	copied = ret2;
				1358	if (ret2 < 0)
				1359	ret = ret2;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1360
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1361	ret2 = ext4_journal_stop(handle);
					/* An earlier error takes precedence over the journal-stop result. */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1362	if (!ret)
				1363	ret = ret2;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1364
				1365	return ret ? ret : copied;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1366	}
				1367
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1368	static int ext4_journalled_write_end(struct file *file,
				1369	struct address_space *mapping,
				1370	loff_t pos, unsigned len, unsigned copied,
				1371	struct page page, void fsdata)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1372	{
					/*
					* Finish a buffered write when the data buffers themselves
					* are journalled (each buffer goes through write_end_fn()).
					*
					* Unlike the other two write_end variants in this file,
					* this one does not call generic_write_end(): it updates
					* i_size, unlocks the page and drops the page reference
					* itself.
					*
					* Returns the first non-zero error seen (buffer walk,
					* ext4_mark_inode_dirty() or ext4_journal_stop());
					* otherwise copied.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1373	handle_t *handle = ext4_journal_current_handle();
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1374	struct inode *inode = mapping->host;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1375	int ret = 0, ret2;
				1376	int partial = 0;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1377	unsigned from, to;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1378
					/* [from, to): byte span of this write inside the page */
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1379	from = pos & (PAGE_CACHE_SIZE - 1);
				1380	to = from + len;
				1381
					/*
					* Short copy: if the page is not uptodate the copied bytes are
					* discarded (copied = 0); the rest of the span up to `to` is
					* passed to page_zero_new_buffers().
					*/
				1382	if (copied < len) {
				1383	if (!PageUptodate(page))
				1384	copied = 0;
				1385	page_zero_new_buffers(page, from+copied, to);
				1386	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1387
				1388	ret = walk_page_buffers(handle, page_buffers(page), from,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1389	to, &partial, write_end_fn);
					/* The page is marked uptodate only if the walk left partial at 0. */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1390	if (!partial)
				1391	SetPageUptodate(page);
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1392	if (pos+copied > inode->i_size)
				1393	i_size_write(inode, pos+copied);
					/* ext4_bmap() tests this flag to decide whether to flush the journal first. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1394	EXT4_I(inode)->i_state \|= EXT4_STATE_JDATA;
				1395	if (inode->i_size > EXT4_I(inode)->i_disksize) {
				1396	EXT4_I(inode)->i_disksize = inode->i_size;
				1397	ret2 = ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1398	if (!ret)
				1399	ret = ret2;
				1400	}
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1401
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1402	unlock_page(page);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1403	ret2 = ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1404	if (!ret)
				1405	ret = ret2;
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1406	page_cache_release(page);
				1407
				1408	return ret ? ret : copied;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1409	}
				1410
				1411	/*
				1412	* bmap() is special. It gets used by applications such as lilo and by
				1413	* the swapper to find the on-disk block of a specific piece of data.
				1414	*
				1415	* Naturally, this is dangerous if the block concerned is still in the
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1416	* journal. If somebody makes a swapfile on an ext4 data-journaling
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1417	* filesystem and enables swap, then they may get a nasty shock when the
				1418	* data getting swapped to that swapfile suddenly gets overwritten by
				1419	* the original zeros written out previously to the journal and
				1420	* awaiting writeback in the kernel's buffer cache.
				1421	*
				1422	* So, if we see any bmap calls here on a modified, data-journaled file,
				1423	* take extra steps to flush any blocks which might be in the cache.
				1424	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1425	static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1426	{
					/*
					* Map logical block @block of the file behind @mapping
					* through generic_block_bmap() and ext4_get_block.
					*
					* If the inode has EXT4_STATE_JDATA set, the flag is cleared
					* and the whole journal is flushed first, with updates locked
					* out for the duration of the flush. Returns 0 if that flush
					* fails; otherwise the generic_block_bmap() result.
					*/
				1427	struct inode *inode = mapping->host;
				1428	journal_t *journal;
				1429	int err;
				1430
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1431	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1432	/*
				1433	* This is a REALLY heavyweight approach, but the use of
				1434	* bmap on dirty files is expected to be extremely rare:
				1435	* only if we run lilo or swapon on a freshly made file
				1436	* do we expect this to happen.
				1437	*
				1438	* (bmap requires CAP_SYS_RAWIO so this does not
				1439	* represent an unprivileged user DOS attack --- we'd be
				1440	* in trouble if mortal users could trigger this path at
				1441	* will.)
				1442	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1443	* NB. EXT4_STATE_JDATA is not set on files other than
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1444	* regular files. If somebody wants to bmap a directory
				1445	* or symlink and gets confused because the buffer
				1446	* hasn't yet been flushed to disk, they deserve
				1447	* everything they get.
				1448	*/
				1449
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1450	EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
				1451	journal = EXT4_JOURNAL(inode);
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1452	jbd2_journal_lock_updates(journal);
				1453	err = jbd2_journal_flush(journal);
				1454	jbd2_journal_unlock_updates(journal);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1455
					/* A failed flush is reported to the caller as block 0. */
				1456	if (err)
				1457	return 0;
				1458	}
				1459
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1460	return generic_block_bmap(mapping,block,ext4_get_block);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1461	}
				1462
				1463	static int bget_one(handle_t handle, struct buffer_head bh)
				1464	{
					/*
					* walk_page_buffers() callback: take one reference on @bh via
					* get_bh(). @handle is unused. Always returns 0.
					*/
				1465	get_bh(bh);
				1466	return 0;
				1467	}
				1468
				1469	static int bput_one(handle_t handle, struct buffer_head bh)
				1470	{
					/*
					* walk_page_buffers() callback: drop one reference on @bh via
					* put_bh(), the counterpart of bget_one(). @handle is unused.
					* Always returns 0.
					*/
				1471	put_bh(bh);
				1472	return 0;
				1473	}
				1474
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1475	static int jbd2_journal_dirty_data_fn(handle_t handle, struct buffer_head bh)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1476	{
					/*
					* walk_page_buffers() callback: pass @bh to
					* ext4_journal_dirty_data() only when the buffer is mapped.
					* Unmapped buffers are skipped and 0 is returned for them.
					*/
				1477	if (buffer_mapped(bh))
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1478	return ext4_journal_dirty_data(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1479	return 0;
				1480	}
				1481
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1482	static int ext4_bh_unmapped_or_delay(handle_t handle, struct buffer_head bh)
				1483	{
					/*
					* Predicate used as a walk_page_buffers() callback by the
					* writepage entry points: non-zero when @bh is not mapped or
					* has the delay flag set, 0 otherwise. @handle is unused.
					*/
				1484	return !buffer_mapped(bh) \|\| buffer_delay(bh);
				1485	}
				1486
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1487	/*
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1488	* Note that we don't need to start a transaction unless we're journaling
				1489	* data because we should have holes filled from ext4_page_mkwrite(). If
				1490	* we are journaling data, we cannot start transaction directly because
				1491	* transaction start ranks above page lock so we have to do some magic...
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1492	*
				1493	* In all journalling modes block_write_full_page() will start the I/O.
				1494	*
				1495	* Problem:
				1496	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1497	* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
				1498	* ext4_writepage()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1499	*
				1500	* Similar for:
				1501	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1502	* ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1503	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1504	* Same applies to ext4_get_block(). We will deadlock on various things like
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	1505	* lock_journal and i_data_sem
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1506	*
				1507	* Setting PF_MEMALLOC here doesn't work - too many internal memory
				1508	* allocations fail.
				1509	*
				1510	* 16May01: If we're reentered then journal_current_handle() will be
				1511	* non-zero. We simply return.
				1512	*
				1513	* 1 July 2001: @@@ FIXME:
				1514	* In journalled data mode, a data buffer may be metadata against the
				1515	* current transaction. But the same file is part of a shared mapping
				1516	* and someone does a writepage() on it.
				1517	*
				1518	* We will move the buffer onto the async_data list, but after it has
				1519	* been dirtied. So there's a small window where we have dirty data on
				1520	* BJ_Metadata.
				1521	*
				1522	* Note that this only applies to the last partial page in the file. The
				1523	* bit which block_write_full_page() uses prepare/commit for. (That's
				1524	* broken code anyway: it's wrong for msync()).
				1525	*
				1526	* It's a rare case: affects the final partial page, for journalled data
				1527	* where the file is subject to both write() and writepage() in the same
				1528	* transaction. To fix it we'll need a custom block_write_full_page().
				1529	* We'll probably need that anyway for journalling writepage() output.
				1530	*
				1531	* We don't honour synchronous mounts for writepage(). That would be
				1532	* disastrous. Any write() or metadata operation will sync the fs for
				1533	* us.
				1534	*
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1535	*/
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1536	static int __ext4_ordered_writepage(struct page *page,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1537	struct writeback_control *wbc)
				1538	{
					/*
					* Write out @page and then pass its buffers to the journal
					* as data.
					*
					* Steps: make sure the page has buffers, take a reference on
					* each buffer, submit the page with block_write_full_page(),
					* and only if that returned 0 start a handle and hand every
					* mapped buffer to ext4_journal_dirty_data(). The buffer
					* references are dropped on every exit path.
					*
					* Returns the first non-zero error from
					* block_write_full_page(), ext4_journal_start(), the buffer
					* walk or ext4_journal_stop(); 0 otherwise.
					*/
				1539	struct inode *inode = page->mapping->host;
				1540	struct buffer_head *page_bufs;
				1541	handle_t *handle = NULL;
				1542	int ret = 0;
				1543	int err;
				1544
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1545	if (!page_has_buffers(page)) {
				1546	create_empty_buffers(page, inode->i_sb->s_blocksize,
				1547	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1548	}
				1549	page_bufs = page_buffers(page);
					/* Pin every buffer; handle is still NULL here and bget_one() ignores it. */
				1550	walk_page_buffers(handle, page_bufs, 0,
				1551	PAGE_CACHE_SIZE, NULL, bget_one);
				1552
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1553	ret = block_write_full_page(page, ext4_get_block, wbc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1554
				1555	/*
				1556	* The page can become unlocked at any point now, and
				1557	* truncate can then come in and change things. So we
				1558	* can't touch page from now on. But page_bufs is
				1559	* safe due to elevated refcount.
				1560	*/
				1561
				1562	/*
				1563	* And attach them to the current transaction. But only if
				1564	* block_write_full_page() succeeded. Otherwise they are unmapped,
				1565	* and generally junk.
				1566	*/
				1567	if (ret == 0) {
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1568	handle = ext4_journal_start(inode,
				1569	ext4_writepage_trans_blocks(inode));
				1570	if (IS_ERR(handle)) {
				1571	ret = PTR_ERR(handle);
				1572	goto out_put;
				1573	}
				1574
				1575	ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1576	NULL, jbd2_journal_dirty_data_fn);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1577	err = ext4_journal_stop(handle);
					/* A walk error takes precedence over the journal-stop result. */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1578	if (!ret)
				1579	ret = err;
				1580	}
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1581	out_put:
					/* Drop the references taken by the bget_one walk; bput_one() ignores handle. */
				1582	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
				1583	bput_one);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1584	return ret;
				1585	}
				1586
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1587	static int ext4_ordered_writepage(struct page *page,
				1588	struct writeback_control *wbc)
				1589	{
					/*
					* writepage entry point that delegates to
					* __ext4_ordered_writepage().
					*
					* Asserts that the page is locked and has buffers, and
					* BUG()s if any buffer in the first len bytes is unmapped
					* or delayed. If the task already holds a journal handle,
					* the page is redirtied and unlocked and 0 is returned;
					* otherwise the result of __ext4_ordered_writepage() is
					* returned.
					*/
				1590	struct inode *inode = page->mapping->host;
				1591	loff_t size = i_size_read(inode);
				1592	loff_t len;
				1593
				1594	J_ASSERT(PageLocked(page));
				1595	J_ASSERT(page_has_buffers(page));
					/*
					* len: bytes of this page checked below - the part below
					* i_size for the page containing i_size, the whole page
					* otherwise (presumably PAGE_CACHE_MASK is the page-aligned
					* mask, so ~PAGE_CACHE_MASK keeps the in-page offset -
					* confirm).
					*/
				1596	if (page->index == size >> PAGE_CACHE_SHIFT)
				1597	len = size & ~PAGE_CACHE_MASK;
				1598	else
				1599	len = PAGE_CACHE_SIZE;
				1600	BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
				1601	ext4_bh_unmapped_or_delay));
				1602
				1603	/*
				1604	* We give up here if we're reentered, because it might be for a
				1605	* different filesystem.
				1606	*/
				1607	if (!ext4_journal_current_handle())
				1608	return __ext4_ordered_writepage(page, wbc);
				1609
				1610	redirty_page_for_writepage(wbc, page);
				1611	unlock_page(page);
				1612	return 0;
				1613	}
				1614
				1615	static int __ext4_writeback_writepage(struct page *page,
				1616	struct writeback_control *wbc)
				1617	{
					/*
					* Write out @page with ext4_get_block as the block mapper:
					* through nobh_writepage() when the NOBH mount option tests
					* true on the superblock, through block_write_full_page()
					* otherwise. Returns the result of whichever was called.
					*/
				1618	struct inode *inode = page->mapping->host;
				1619
				1620	if (test_opt(inode->i_sb, NOBH))
				1621	return nobh_writepage(page, ext4_get_block, wbc);
				1622	else
				1623	return block_write_full_page(page, ext4_get_block, wbc);
				1624	}
				1625
				1626
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1627	static int ext4_writeback_writepage(struct page *page,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1628	struct writeback_control *wbc)
				1629	{
					/*
					* writepage entry point that delegates to
					* __ext4_writeback_writepage().
					*
					* Asserts that the page is locked and has buffers, and
					* BUG()s if any buffer in the first len bytes is unmapped
					* or delayed. If the task already holds a journal handle,
					* the page is redirtied and unlocked and 0 is returned;
					* otherwise the result of __ext4_writeback_writepage() is
					* returned.
					*/
				1630	struct inode *inode = page->mapping->host;
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1631	loff_t size = i_size_read(inode);
				1632	loff_t len;
				1633
				1634	J_ASSERT(PageLocked(page));
				1635	J_ASSERT(page_has_buffers(page));
					/* len: part of the page below i_size for the page containing i_size, else the whole page */
				1636	if (page->index == size >> PAGE_CACHE_SHIFT)
				1637	len = size & ~PAGE_CACHE_MASK;
				1638	else
				1639	len = PAGE_CACHE_SIZE;
				1640	BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
				1641	ext4_bh_unmapped_or_delay));
				1642
					/* No handle held by this task: safe to write now. Otherwise defer by redirtying. */
				1643	if (!ext4_journal_current_handle())
				1644	return __ext4_writeback_writepage(page, wbc);
				1645
				1646	redirty_page_for_writepage(wbc, page);
				1647	unlock_page(page);
				1648	return 0;
				1649	}
				1650
				1651	static int __ext4_journalled_writepage(struct page *page,
				1652	struct writeback_control *wbc)
				1653	{
					/*
					* Journal the contents of @page instead of submitting it for
					* I/O directly.
					*
					* Steps: block_prepare_write() over the whole page, take a
					* reference on each buffer, unlock the page, start a handle,
					* then walk the buffers twice - first with
					* do_journal_get_write_access(), then with write_end_fn() -
					* stop the handle, drop the buffer references and set
					* EXT4_STATE_JDATA on the inode.
					*
					* Returns the first non-zero error seen, 0 otherwise. The
					* page is unlocked on every path that returns. @wbc is not
					* used in this function.
					*/
				1654	struct address_space *mapping = page->mapping;
				1655	struct inode *inode = mapping->host;
				1656	struct buffer_head *page_bufs;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1657	handle_t *handle = NULL;
				1658	int ret = 0;
				1659	int err;
				1660
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1661	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
				1662	if (ret != 0)
				1663	goto out_unlock;
				1664
				1665	page_bufs = page_buffers(page);
				1666	walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
				1667	bget_one);
				1668	/* As soon as we unlock the page, it can go away, but we have
				1669	* references to buffers so we are safe */
				1670	unlock_page(page);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1671
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1672	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
					/*
					* NOTE(review): this failure path jumps past the bput_one walk
					* below, so the references taken by the bget_one walk above
					* appear to be leaked here - confirm.
					*/
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1673	if (IS_ERR(handle)) {
				1674	ret = PTR_ERR(handle);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1675	goto out;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1676	}
				1677
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1678	ret = walk_page_buffers(handle, page_bufs, 0,
				1679	PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1680
					/* The second walk runs even if the first failed; the first error is the one kept. */
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1681	err = walk_page_buffers(handle, page_bufs, 0,
				1682	PAGE_CACHE_SIZE, NULL, write_end_fn);
				1683	if (ret == 0)
				1684	ret = err;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1685	err = ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1686	if (!ret)
				1687	ret = err;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1688
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1689	walk_page_buffers(handle, page_bufs, 0,
				1690	PAGE_CACHE_SIZE, NULL, bput_one);
				1691	EXT4_I(inode)->i_state \|= EXT4_STATE_JDATA;
				1692	goto out;
				1693
				1694	out_unlock:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1695	unlock_page(page);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1696	out:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1697	return ret;
				1698	}
				1699
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1700	static int ext4_journalled_writepage(struct page *page,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1701	struct writeback_control *wbc)
				1702	{
					/*
					* writepage entry point that journals the page via
					* __ext4_journalled_writepage() when PageChecked is set.
					*
					* Asserts that the page is locked and has buffers, and
					* BUG()s if any buffer in the first len bytes is unmapped
					* or delayed. Then:
					* - task already holds a journal handle: redirty the page,
					* unlock it and return 0;
					* - PageChecked set: clear it and return the result of
					* __ext4_journalled_writepage();
					* - otherwise: return the result of block_write_full_page().
					*/
				1703	struct inode *inode = page->mapping->host;
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1704	loff_t size = i_size_read(inode);
				1705	loff_t len;
				1706
				1707	J_ASSERT(PageLocked(page));
				1708	J_ASSERT(page_has_buffers(page));
					/* len: part of the page below i_size for the page containing i_size, else the whole page */
				1709	if (page->index == size >> PAGE_CACHE_SHIFT)
				1710	len = size & ~PAGE_CACHE_MASK;
				1711	else
				1712	len = PAGE_CACHE_SIZE;
				1713	BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
				1714	ext4_bh_unmapped_or_delay));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1715
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1716	if (ext4_journal_current_handle())
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1717	goto no_write;
				1718
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1719	if (PageChecked(page)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1720	/*
				1721	* It's mmapped pagecache. Add buffers and journal it. There
				1722	* doesn't seem much point in redirtying the page here.
				1723	*/
				1724	ClearPageChecked(page);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1725	return __ext4_journalled_writepage(page, wbc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1726	} else {
				1727	/*
				1728	* It may be a page full of checkpoint-mode buffers. We don't
				1729	* really know unless we go poke around in the buffer_heads.
				1730	* But block_write_full_page will do the right thing.
				1731	*/
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1732	return block_write_full_page(page, ext4_get_block, wbc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1733	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1734	no_write:
				1735	redirty_page_for_writepage(wbc, page);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1736	unlock_page(page);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1737	return 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1738	}
				1739
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1740	static int ext4_readpage(struct file file, struct page page)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1741	{
					/*
					* Read one page: forwards @page to mpage_readpage() with
					* ext4_get_block as the block mapper and returns its result.
					* @file is unused.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1742	return mpage_readpage(page, ext4_get_block);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1743	}
				1744
				1745	static int
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1746	ext4_readpages(struct file file, struct address_space mapping,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1747	struct list_head *pages, unsigned nr_pages)
				1748	{
					/*
					* Read a batch of pages: forwards @mapping, the @pages list
					* and @nr_pages to mpage_readpages() with ext4_get_block as
					* the block mapper and returns its result. @file is unused.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1749	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1750	}
				1751
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1752	static void ext4_invalidatepage(struct page *page, unsigned long offset)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1753	{
					/*
					* Invalidate @page from byte @offset onward by delegating to
					* jbd2_journal_invalidatepage() on the journal of the owning
					* inode. When @offset is 0 the PageChecked flag is cleared
					* first.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1754	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1755
				1756	/*
				1757	* If it's a full truncate we just forget about the pending dirtying
				1758	*/
				1759	if (offset == 0)
				1760	ClearPageChecked(page);
				1761
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1762	jbd2_journal_invalidatepage(journal, page, offset);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1763	}
				1764
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1765	static int ext4_releasepage(struct page *page, gfp_t wait)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1766	{
					/*
					* Try to release the buffers attached to @page.
					*
					* Warns if the page still has PageChecked set. Returns 0 when
					* the page has no buffers; otherwise returns the result of
					* jbd2_journal_try_to_free_buffers(), to which @wait is passed
					* through unchanged.
					*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1767	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1768
				1769	WARN_ON(PageChecked(page));
				1770	if (!page_has_buffers(page))
				1771	return 0;
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	1772	return jbd2_journal_try_to_free_buffers(journal, page, wait);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1773	}
				1774
				1775	/*
				1776	* If the O_DIRECT write will extend the file then add this inode to the
				1777	* orphan list. So recovery will truncate it back to the original size
				1778	* if the machine crashes during the write.
				1779	*
				1780	* If the O_DIRECT write is instantiating holes inside i_size and the machine
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1781	* crashes then stale disk data _may_ be exposed inside the file. But current
				1782	* VFS code falls back into buffered path in that case so we are safe.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1783	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1784	static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1785	const struct iovec *iov, loff_t offset,
				1786	unsigned long nr_segs)
				1787	{
				1788	struct file *file = iocb->ki_filp;
				1789	struct inode *inode = file->f_mapping->host;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1790	struct ext4_inode_info *ei = EXT4_I(inode);
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1791	handle_t *handle;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1792	ssize_t ret;
				1793	int orphan = 0;
				1794	size_t count = iov_length(iov, nr_segs);
				1795
				1796	if (rw == WRITE) {
				1797	loff_t final_size = offset + count;
				1798
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1799	if (final_size > inode->i_size) {
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1800	/* Credits for sb + inode write */
				1801	handle = ext4_journal_start(inode, 2);
				1802	if (IS_ERR(handle)) {
				1803	ret = PTR_ERR(handle);
				1804	goto out;
				1805	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1806	ret = ext4_orphan_add(handle, inode);
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1807	if (ret) {
				1808	ext4_journal_stop(handle);
				1809	goto out;
				1810	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1811	orphan = 1;
				1812	ei->i_disksize = inode->i_size;
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1813	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1814	}
				1815	}
				1816
				1817	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				1818	offset, nr_segs,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1819	ext4_get_block, NULL);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1820
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1821	if (orphan) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1822	int err;
				1823
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1824	/* Credits for sb + inode write */
				1825	handle = ext4_journal_start(inode, 2);
				1826	if (IS_ERR(handle)) {
				1827	/* This is really bad luck. We've written the data
				1828	* but cannot extend i_size. Bail out and pretend
				1829	* the write failed... */
				1830	ret = PTR_ERR(handle);
				1831	goto out;
				1832	}
				1833	if (inode->i_nlink)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1834	ext4_orphan_del(handle, inode);
Jan Kara	7fb5409	2008-02-10 01:08:38 -0500	[diff] [blame]	1835	if (ret > 0) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1836	loff_t end = offset + ret;
				1837	if (end > inode->i_size) {
				1838	ei->i_disksize = end;
				1839	i_size_write(inode, end);
				1840	/*
				1841	* We're going to return a positive `ret'
				1842	* here due to non-zero-length I/O, so there's
				1843	* no way of reporting error returns from
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1844	* ext4_mark_inode_dirty() to userspace. So
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1845	* ignore it.
				1846	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1847	ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1848	}
				1849	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1850	err = ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1851	if (ret == 0)
				1852	ret = err;
				1853	}
				1854	out:
				1855	return ret;
				1856	}
				1857
				1858	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1859	* Pages can be marked dirty completely asynchronously from ext4's journalling
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1860	* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
				1861	* much here because ->set_page_dirty is called under VFS locks. The page is
				1862	* not necessarily locked.
				1863	*
				1864	* We cannot just dirty the page and leave attached buffers clean, because the
				1865	* buffers' dirty state is "definitive". We cannot just set the buffers dirty
				1866	* or jbddirty because all the journalling code will explode.
				1867	*
				1868	* So what we do is to mark the page "pending dirty" and next time writepage
				1869	* is called, propagate that into the buffers appropriately.
				1870	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1871	static int ext4_journalled_set_page_dirty(struct page *page)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1872	{
				1873	SetPageChecked(page);
				1874	return __set_page_dirty_nobuffers(page);
				1875	}
				1876
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1877	static const struct address_space_operations ext4_ordered_aops = {
				1878	.readpage = ext4_readpage,
				1879	.readpages = ext4_readpages,
				1880	.writepage = ext4_ordered_writepage,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1881	.sync_page = block_sync_page,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1882	.write_begin = ext4_write_begin,
				1883	.write_end = ext4_ordered_write_end,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1884	.bmap = ext4_bmap,
				1885	.invalidatepage = ext4_invalidatepage,
				1886	.releasepage = ext4_releasepage,
				1887	.direct_IO = ext4_direct_IO,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1888	.migratepage = buffer_migrate_page,
				1889	};
				1890
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1891	static const struct address_space_operations ext4_writeback_aops = {
				1892	.readpage = ext4_readpage,
				1893	.readpages = ext4_readpages,
				1894	.writepage = ext4_writeback_writepage,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1895	.sync_page = block_sync_page,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1896	.write_begin = ext4_write_begin,
				1897	.write_end = ext4_writeback_write_end,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1898	.bmap = ext4_bmap,
				1899	.invalidatepage = ext4_invalidatepage,
				1900	.releasepage = ext4_releasepage,
				1901	.direct_IO = ext4_direct_IO,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1902	.migratepage = buffer_migrate_page,
				1903	};
				1904
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1905	static const struct address_space_operations ext4_journalled_aops = {
				1906	.readpage = ext4_readpage,
				1907	.readpages = ext4_readpages,
				1908	.writepage = ext4_journalled_writepage,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1909	.sync_page = block_sync_page,
Nick Piggin	bfc1af6	2007-10-16 01:25:05 -0700	[diff] [blame]	1910	.write_begin = ext4_write_begin,
				1911	.write_end = ext4_journalled_write_end,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1912	.set_page_dirty = ext4_journalled_set_page_dirty,
				1913	.bmap = ext4_bmap,
				1914	.invalidatepage = ext4_invalidatepage,
				1915	.releasepage = ext4_releasepage,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1916	};
				1917
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1918	void ext4_set_aops(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1919	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1920	if (ext4_should_order_data(inode))
				1921	inode->i_mapping->a_ops = &ext4_ordered_aops;
				1922	else if (ext4_should_writeback_data(inode))
				1923	inode->i_mapping->a_ops = &ext4_writeback_aops;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1924	else
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1925	inode->i_mapping->a_ops = &ext4_journalled_aops;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1926	}
				1927
				1928	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1929	* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1930	* up to the end of the block which corresponds to `from'.
				1931	* This required during truncate. We need to physically zero the tail end
				1932	* of that block so it doesn't yield old data if the file is later grown.
				1933	*/
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1934	int ext4_block_truncate_page(handle_t *handle,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1935	struct address_space *mapping, loff_t from)
				1936	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1937	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1938	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	1939	unsigned blocksize, length, pos;
				1940	ext4_lblk_t iblock;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1941	struct inode *inode = mapping->host;
				1942	struct buffer_head *bh;
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1943	struct page *page;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1944	int err = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1945
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	1946	page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
				1947	if (!page)
				1948	return -EINVAL;
				1949
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1950	blocksize = inode->i_sb->s_blocksize;
				1951	length = blocksize - (offset & (blocksize - 1));
				1952	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
				1953
				1954	/*
				1955	* For "nobh" option, we can only work if we don't need to
				1956	* read-in the page - otherwise we create buffers to do the IO.
				1957	*/
				1958	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1959	ext4_should_writeback_data(inode) && PageUptodate(page)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1960	zero_user(page, offset, length);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1961	set_page_dirty(page);
				1962	goto unlock;
				1963	}
				1964
				1965	if (!page_has_buffers(page))
				1966	create_empty_buffers(page, blocksize, 0);
				1967
				1968	/* Find the buffer that contains "offset" */
				1969	bh = page_buffers(page);
				1970	pos = blocksize;
				1971	while (offset >= pos) {
				1972	bh = bh->b_this_page;
				1973	iblock++;
				1974	pos += blocksize;
				1975	}
				1976
				1977	err = 0;
				1978	if (buffer_freed(bh)) {
				1979	BUFFER_TRACE(bh, "freed: skip");
				1980	goto unlock;
				1981	}
				1982
				1983	if (!buffer_mapped(bh)) {
				1984	BUFFER_TRACE(bh, "unmapped");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	1985	ext4_get_block(inode, iblock, bh, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	1986	/* unmapped? It's a hole - nothing to do */
				1987	if (!buffer_mapped(bh)) {
				1988	BUFFER_TRACE(bh, "still unmapped");
				1989	goto unlock;
				1990	}
				1991	}
				1992
				1993	/* Ok, it's mapped. Make sure it's up-to-date */
				1994	if (PageUptodate(page))
				1995	set_buffer_uptodate(bh);
				1996
				1997	if (!buffer_uptodate(bh)) {
				1998	err = -EIO;
				1999	ll_rw_block(READ, 1, &bh);
				2000	wait_on_buffer(bh);
				2001	/* Uhhuh. Read error. Complain and punt. */
				2002	if (!buffer_uptodate(bh))
				2003	goto unlock;
				2004	}
				2005
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2006	if (ext4_should_journal_data(inode)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2007	BUFFER_TRACE(bh, "get write access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2008	err = ext4_journal_get_write_access(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2009	if (err)
				2010	goto unlock;
				2011	}
				2012
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2013	zero_user(page, offset, length);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2014
				2015	BUFFER_TRACE(bh, "zeroed end of block");
				2016
				2017	err = 0;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2018	if (ext4_should_journal_data(inode)) {
				2019	err = ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2020	} else {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2021	if (ext4_should_order_data(inode))
				2022	err = ext4_journal_dirty_data(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2023	mark_buffer_dirty(bh);
				2024	}
				2025
				2026	unlock:
				2027	unlock_page(page);
				2028	page_cache_release(page);
				2029	return err;
				2030	}
				2031
				2032	/*
				2033	* Probably it should be a library function... search for first non-zero word
				2034	* or memcmp with zero_page, whatever is better for particular architecture.
				2035	* Linus?
				2036	*/
				2037	static inline int all_zeroes(__le32 p, __le32 q)
				2038	{
				2039	while (p < q)
				2040	if (*p++)
				2041	return 0;
				2042	return 1;
				2043	}
				2044
				2045	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2046	* ext4_find_shared - find the indirect blocks for partial truncation.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2047	* @inode: inode in question
				2048	* @depth: depth of the affected branch
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2049	* @offsets: offsets of pointers in that branch (see ext4_block_to_path)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2050	* @chain: place to store the pointers to partial indirect blocks
				2051	* @top: place to the (detached) top of branch
				2052	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2053	* This is a helper function used by ext4_truncate().
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2054	*
				2055	* When we do truncate() we may have to clean the ends of several
				2056	* indirect blocks but leave the blocks themselves alive. Block is
				2057	* partially truncated if some data below the new i_size is refered
				2058	* from it (and it is on the path to the first completely truncated
				2059	* data block, indeed). We have to free the top of that path along
				2060	* with everything to the right of the path. Since no allocation
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2061	* past the truncation point is possible until ext4_truncate()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2062	* finishes, we may safely do the latter, but top of branch may
				2063	* require special attention - pageout below the truncation point
				2064	* might try to populate it.
				2065	*
				2066	* We atomically detach the top of branch from the tree, store the
				2067	* block number of its root in *@top, pointers to buffer_heads of
				2068	* partially truncated blocks - in @chain[].bh and pointers to
				2069	* their last elements that should not be removed - in
				2070	* @chain[].p. Return value is the pointer to last filled element
				2071	* of @chain.
				2072	*
				2073	* The work left to caller to do the actual freeing of subtrees:
				2074	* a) free the subtree starting from *@top
				2075	* b) free the subtrees whose roots are stored in
				2076	* (@chain[i].p+1 .. end of @chain[i].bh->b_data)
				2077	* c) free the subtrees growing from the inode past the @chain[0].
				2078	* (no partially truncated stuff there). */
				2079
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2080	static Indirect ext4_find_shared(struct inode inode, int depth,
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	2081	ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2082	{
				2083	Indirect partial, p;
				2084	int k, err;
				2085
				2086	*top = 0;
				2087	/* Make k index the deepest non-null offest + 1 */
				2088	for (k = depth; k > 1 && !offsets[k-1]; k--)
				2089	;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2090	partial = ext4_get_branch(inode, k, offsets, chain, &err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2091	/* Writer: pointers */
				2092	if (!partial)
				2093	partial = chain + k-1;
				2094	/*
				2095	* If the branch acquired continuation since we've looked at it -
				2096	* fine, it should all survive and (new) top doesn't belong to us.
				2097	*/
				2098	if (!partial->key && *partial->p)
				2099	/* Writer: end */
				2100	goto no_top;
				2101	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
				2102	;
				2103	/*
				2104	* OK, we've found the last block that must survive. The rest of our
				2105	* branch should be detached before unlocking. However, if that rest
				2106	* of branch is all ours and does not grow immediately from the inode
				2107	* it's easier to cheat and just decrement partial->p.
				2108	*/
				2109	if (p == chain + k - 1 && p > chain) {
				2110	p->p--;
				2111	} else {
				2112	top = p->p;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2113	/* Nope, don't do this in ext4. Must leave the tree intact */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2114	#if 0
				2115	*p->p = 0;
				2116	#endif
				2117	}
				2118	/* Writer: end */
				2119
				2120	while(partial > p) {
				2121	brelse(partial->bh);
				2122	partial--;
				2123	}
				2124	no_top:
				2125	return partial;
				2126	}
				2127
				2128	/*
				2129	* Zero a number of block pointers in either an inode or an indirect block.
				2130	* If we restart the transaction we must again get write access to the
				2131	* indirect block for further modification.
				2132	*
				2133	* We release `count' blocks on disk, but (last - first) may be greater
				2134	* than `count' because there can be holes in there.
				2135	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2136	static void ext4_clear_blocks(handle_t handle, struct inode inode,
				2137	struct buffer_head *bh, ext4_fsblk_t block_to_free,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2138	unsigned long count, __le32 first, __le32 last)
				2139	{
				2140	__le32 *p;
				2141	if (try_to_extend_transaction(handle, inode)) {
				2142	if (bh) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2143	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
				2144	ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2145	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2146	ext4_mark_inode_dirty(handle, inode);
				2147	ext4_journal_test_restart(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2148	if (bh) {
				2149	BUFFER_TRACE(bh, "retaking write access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2150	ext4_journal_get_write_access(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2151	}
				2152	}
				2153
				2154	/*
				2155	* Any buffers which are on the journal will be in memory. We find
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	2156	* them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2157	* on them. We've already detached each block from the file, so
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	2158	* bforget() in jbd2_journal_forget() should be safe.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2159	*
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	2160	* AKPM: turn on bforget in jbd2_journal_forget()!!!
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2161	*/
				2162	for (p = first; p < last; p++) {
				2163	u32 nr = le32_to_cpu(*p);
				2164	if (nr) {
Aneesh Kumar K.V	1d03ec9	2008-01-28 23:58:27 -0500	[diff] [blame]	2165	struct buffer_head *tbh;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2166
				2167	*p = 0;
Aneesh Kumar K.V	1d03ec9	2008-01-28 23:58:27 -0500	[diff] [blame]	2168	tbh = sb_find_get_block(inode->i_sb, nr);
				2169	ext4_forget(handle, 0, inode, tbh, nr);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2170	}
				2171	}
				2172
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	2173	ext4_free_blocks(handle, inode, block_to_free, count, 0);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2174	}
				2175
				2176	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2177	* ext4_free_data - free a list of data blocks
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2178	* @handle: handle for this transaction
				2179	* @inode: inode we are dealing with
				2180	* @this_bh: indirect buffer_head which contains @first and @last
				2181	* @first: array of block numbers
				2182	* @last: points immediately past the end of array
				2183	*
				2184	* We are freeing all blocks refered from that array (numbers are stored as
				2185	* little-endian 32-bit) and updating @inode->i_blocks appropriately.
				2186	*
				2187	* We accumulate contiguous runs of blocks to free. Conveniently, if these
				2188	* blocks are contiguous then releasing them at one time will only affect one
				2189	* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
				2190	* actually use a lot of journal space.
				2191	*
				2192	* @this_bh will be %NULL if @first and @last point into the inode's direct
				2193	* block pointers.
				2194	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2195	static void ext4_free_data(handle_t handle, struct inode inode,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2196	struct buffer_head *this_bh,
				2197	__le32 first, __le32 last)
				2198	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2199	ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2200	unsigned long count = 0; /* Number of blocks in the run */
				2201	__le32 block_to_free_p = NULL; / Pointer into inode/ind
				2202	corresponding to
				2203	block_to_free */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2204	ext4_fsblk_t nr; /* Current block # */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2205	__le32 p; / Pointer into inode/ind
				2206	for current block */
				2207	int err;
				2208
				2209	if (this_bh) { /* For indirect block */
				2210	BUFFER_TRACE(this_bh, "get_write_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2211	err = ext4_journal_get_write_access(handle, this_bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2212	/* Important: if we can't update the indirect pointers
				2213	* to the blocks, we can't free them. */
				2214	if (err)
				2215	return;
				2216	}
				2217
				2218	for (p = first; p < last; p++) {
				2219	nr = le32_to_cpu(*p);
				2220	if (nr) {
				2221	/* accumulate blocks to free if they're contiguous */
				2222	if (count == 0) {
				2223	block_to_free = nr;
				2224	block_to_free_p = p;
				2225	count = 1;
				2226	} else if (nr == block_to_free + count) {
				2227	count++;
				2228	} else {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2229	ext4_clear_blocks(handle, inode, this_bh,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2230	block_to_free,
				2231	count, block_to_free_p, p);
				2232	block_to_free = nr;
				2233	block_to_free_p = p;
				2234	count = 1;
				2235	}
				2236	}
				2237	}
				2238
				2239	if (count > 0)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2240	ext4_clear_blocks(handle, inode, this_bh, block_to_free,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2241	count, block_to_free_p, p);
				2242
				2243	if (this_bh) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2244	BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
Duane Griffin	71dc8fb	2008-07-11 19:27:31 -0400	[diff] [blame]	2245
				2246	/*
				2247	* The buffer head should have an attached journal head at this
				2248	* point. However, if the data is corrupted and an indirect
				2249	* block pointed to itself, it would have been detached when
				2250	* the block was cleared. Check for this instead of OOPSing.
				2251	*/
				2252	if (bh2jh(this_bh))
				2253	ext4_journal_dirty_metadata(handle, this_bh);
				2254	else
				2255	ext4_error(inode->i_sb, __func__,
				2256	"circular indirect block detected, "
				2257	"inode=%lu, block=%llu",
				2258	inode->i_ino,
				2259	(unsigned long long) this_bh->b_blocknr);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2260	}
				2261	}
				2262
				2263	/**
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2264	* ext4_free_branches - free an array of branches
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2265	* @handle: JBD handle for this transaction
				2266	* @inode: inode we are dealing with
				2267	* @parent_bh: the buffer_head which contains @first and @last
				2268	* @first: array of block numbers
				2269	* @last: pointer immediately past the end of array
				2270	* @depth: depth of the branches to free
				2271	*
				2272	* We are freeing all blocks refered from these branches (numbers are
				2273	* stored as little-endian 32-bit) and updating @inode->i_blocks
				2274	* appropriately.
				2275	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2276	static void ext4_free_branches(handle_t handle, struct inode inode,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2277	struct buffer_head *parent_bh,
				2278	__le32 first, __le32 last, int depth)
				2279	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2280	ext4_fsblk_t nr;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2281	__le32 *p;
				2282
				2283	if (is_handle_aborted(handle))
				2284	return;
				2285
				2286	if (depth--) {
				2287	struct buffer_head *bh;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2288	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2289	p = last;
				2290	while (--p >= first) {
				2291	nr = le32_to_cpu(*p);
				2292	if (!nr)
				2293	continue; /* A hole */
				2294
				2295	/* Go read the buffer for the next level down */
				2296	bh = sb_bread(inode->i_sb, nr);
				2297
				2298	/*
				2299	* A read failure? Report error and clear slot
				2300	* (should be rare).
				2301	*/
				2302	if (!bh) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2303	ext4_error(inode->i_sb, "ext4_free_branches",
Mingming Cao	2ae0210	2006-10-11 01:21:11 -0700	[diff] [blame]	2304	"Read failure, inode=%lu, block=%llu",
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2305	inode->i_ino, nr);
				2306	continue;
				2307	}
				2308
				2309	/* This zaps the entire block. Bottom up. */
				2310	BUFFER_TRACE(bh, "free child branches");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2311	ext4_free_branches(handle, inode, bh,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2312	(__le32*)bh->b_data,
				2313	(__le32*)bh->b_data + addr_per_block,
				2314	depth);
				2315
				2316	/*
				2317	* We've probably journalled the indirect block several
				2318	* times during the truncate. But it's no longer
				2319	* needed and we now drop it from the transaction via
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	2320	* jbd2_journal_revoke().
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2321	*
				2322	* That's easy if it's exclusively part of this
				2323	* transaction. But if it's part of the committing
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	2324	* transaction then jbd2_journal_forget() will simply
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2325	* brelse() it. That means that if the underlying
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2326	* block is reallocated in ext4_get_block(),
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2327	* unmap_underlying_metadata() will find this block
				2328	* and will try to get rid of it. damn, damn.
				2329	*
				2330	* If this block has already been committed to the
				2331	* journal, a revoke record will be written. And
				2332	* revoke records must be emitted before clearing
				2333	* this block's bit in the bitmaps.
				2334	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2335	ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2336
				2337	/*
				2338	* Everything below this this pointer has been
				2339	* released. Now let this top-of-subtree go.
				2340	*
				2341	* We want the freeing of this indirect block to be
				2342	* atomic in the journal with the updating of the
				2343	* bitmap block which owns it. So make some room in
				2344	* the journal.
				2345	*
				2346	* We zero the parent pointer after freeing its
				2347	* pointee in the bitmaps, so if extend_transaction()
				2348	* for some reason fails to put the bitmap changes and
				2349	* the release into the same transaction, recovery
				2350	* will merely complain about releasing a free block,
				2351	* rather than leaking blocks.
				2352	*/
				2353	if (is_handle_aborted(handle))
				2354	return;
				2355	if (try_to_extend_transaction(handle, inode)) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2356	ext4_mark_inode_dirty(handle, inode);
				2357	ext4_journal_test_restart(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2358	}
				2359
Alex Tomas	c9de560	2008-01-29 00:19:52 -0500	[diff] [blame]	2360	ext4_free_blocks(handle, inode, nr, 1, 1);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2361
				2362	if (parent_bh) {
				2363	/*
				2364	* The block which we have just freed is
				2365	* pointed to by an indirect block: journal it
				2366	*/
				2367	BUFFER_TRACE(parent_bh, "get_write_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2368	if (!ext4_journal_get_write_access(handle,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2369	parent_bh)){
				2370	*p = 0;
				2371	BUFFER_TRACE(parent_bh,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2372	"call ext4_journal_dirty_metadata");
				2373	ext4_journal_dirty_metadata(handle,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2374	parent_bh);
				2375	}
				2376	}
				2377	}
				2378	} else {
				2379	/* We have reached the bottom of the tree. */
				2380	BUFFER_TRACE(parent_bh, "free data blocks");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2381	ext4_free_data(handle, inode, parent_bh, first, last);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2382	}
				2383	}
				2384
Duane Griffin	91ef4ca	2008-07-11 19:27:31 -0400	[diff] [blame]	2385	int ext4_can_truncate(struct inode *inode)
				2386	{
				2387	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				2388	return 0;
				2389	if (S_ISREG(inode->i_mode))
				2390	return 1;
				2391	if (S_ISDIR(inode->i_mode))
				2392	return 1;
				2393	if (S_ISLNK(inode->i_mode))
				2394	return !ext4_inode_is_fast_symlink(inode);
				2395	return 0;
				2396	}
				2397
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2398	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2399	* ext4_truncate()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2400	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2401	* We block out ext4_get_block() block instantiations across the entire
				2402	* transaction, and VFS/VM ensures that ext4_truncate() cannot run
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2403	* simultaneously on behalf of the same inode.
				2404	*
				2405	* As we work through the truncate and commit bits of it to the journal there
				2406	* is one core, guiding principle: the file's tree must always be consistent on
				2407	* disk. We must be able to restart the truncate after a crash.
				2408	*
				2409	* The file's tree may be transiently inconsistent in memory (although it
				2410	* probably isn't), but whenever we close off and commit a journal transaction,
				2411	* the contents of (the filesystem + the journal) must be consistent and
				2412	* restartable. It's pretty simple, really: bottom up, right to left (although
				2413	* left-to-right works OK too).
				2414	*
				2415	* Note that at recovery time, journal replay occurs before the restart of
				2416	* truncate against the orphan inode list.
				2417	*
				2418	* The committed inode has the new, desired i_size (which is the same as
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2419	* i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2420	* that this inode's truncate did not complete and it will again call
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2421	* ext4_truncate() to have another go. So there will be instantiated blocks
				2422	* to the right of the truncation point in a crashed ext4 filesystem. But
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2423	* that's fine - as long as they are linked from the inode, the post-crash
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2424	* ext4_truncate() run will find them and release them.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2425	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2426	void ext4_truncate(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2427	{
				2428	handle_t *handle;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2429	struct ext4_inode_info *ei = EXT4_I(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2430	__le32 *i_data = ei->i_data;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2431	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2432	struct address_space *mapping = inode->i_mapping;
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	2433	ext4_lblk_t offsets[4];
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2434	Indirect chain[4];
				2435	Indirect *partial;
				2436	__le32 nr = 0;
				2437	int n;
Aneesh Kumar K.V	725d26d	2008-01-28 23:58:27 -0500	[diff] [blame]	2438	ext4_lblk_t last_block;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2439	unsigned blocksize = inode->i_sb->s_blocksize;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2440
Duane Griffin	91ef4ca	2008-07-11 19:27:31 -0400	[diff] [blame]	2441	if (!ext4_can_truncate(inode))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2442	return;
				2443
Aneesh Kumar K.V	1d03ec9	2008-01-28 23:58:27 -0500	[diff] [blame]	2444	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	2445	ext4_ext_truncate(inode);
Aneesh Kumar K.V	1d03ec9	2008-01-28 23:58:27 -0500	[diff] [blame]	2446	return;
				2447	}
Alex Tomas	a86c618	2006-10-11 01:21:03 -0700	[diff] [blame]	2448
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2449	handle = start_transaction(inode);
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	2450	if (IS_ERR(handle))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2451	return; /* AKPM: return what? */
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2452
				2453	last_block = (inode->i_size + blocksize-1)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2454	>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2455
Jan Kara	cf108bc	2008-07-11 19:27:31 -0400	[diff] [blame^]	2456	if (inode->i_size & (blocksize - 1))
				2457	if (ext4_block_truncate_page(handle, mapping, inode->i_size))
				2458	goto out_stop;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2459
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2460	n = ext4_block_to_path(inode, last_block, offsets, NULL);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2461	if (n == 0)
				2462	goto out_stop; /* error */
				2463
				2464	/*
				2465	* OK. This truncate is going to happen. We add the inode to the
				2466	* orphan list, so that if this truncate spans multiple transactions,
				2467	* and we crash, we will resume the truncate when the filesystem
				2468	* recovers. It also marks the inode dirty, to catch the new size.
				2469	*
				2470	* Implication: the file must always be in a sane, consistent
				2471	* truncatable state while each transaction commits.
				2472	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2473	if (ext4_orphan_add(handle, inode))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2474	goto out_stop;
				2475
				2476	/*
				2477	* The orphan list entry will now protect us from any crash which
				2478	* occurs before the truncate completes, so it is now safe to propagate
				2479	* the new, shorter inode size (held for now in i_size) into the
				2480	* on-disk inode. We do this via i_disksize, which is the value which
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2481	* ext4 really writes onto the disk inode.
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2482	*/
				2483	ei->i_disksize = inode->i_size;
				2484
				2485	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2486	* From here we block out all ext4_get_block() callers who want to
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2487	* modify the block allocation tree.
				2488	*/
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	2489	down_write(&ei->i_data_sem);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2490
				2491	if (n == 1) { /* direct blocks */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2492	ext4_free_data(handle, inode, NULL, i_data+offsets[0],
				2493	i_data + EXT4_NDIR_BLOCKS);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2494	goto do_indirects;
				2495	}
				2496
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2497	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2498	/* Kill the top of shared branch (not detached) */
				2499	if (nr) {
				2500	if (partial == chain) {
				2501	/* Shared branch grows from the inode */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2502	ext4_free_branches(handle, inode, NULL,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2503	&nr, &nr+1, (chain+n-1) - partial);
				2504	*partial->p = 0;
				2505	/*
				2506	* We mark the inode dirty prior to restart,
				2507	* and prior to stop. No need for it here.
				2508	*/
				2509	} else {
				2510	/* Shared branch grows from an indirect block */
				2511	BUFFER_TRACE(partial->bh, "get_write_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2512	ext4_free_branches(handle, inode, partial->bh,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2513	partial->p,
				2514	partial->p+1, (chain+n-1) - partial);
				2515	}
				2516	}
				2517	/* Clear the ends of indirect blocks on the shared branch */
				2518	while (partial > chain) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2519	ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2520	(__le32*)partial->bh->b_data+addr_per_block,
				2521	(chain+n-1) - partial);
				2522	BUFFER_TRACE(partial->bh, "call brelse");
				2523	brelse (partial->bh);
				2524	partial--;
				2525	}
				2526	do_indirects:
				2527	/* Kill the remaining (whole) subtrees */
				2528	switch (offsets[0]) {
				2529	default:
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2530	nr = i_data[EXT4_IND_BLOCK];
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2531	if (nr) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2532	ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
				2533	i_data[EXT4_IND_BLOCK] = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2534	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2535	case EXT4_IND_BLOCK:
				2536	nr = i_data[EXT4_DIND_BLOCK];
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2537	if (nr) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2538	ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
				2539	i_data[EXT4_DIND_BLOCK] = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2540	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2541	case EXT4_DIND_BLOCK:
				2542	nr = i_data[EXT4_TIND_BLOCK];
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2543	if (nr) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2544	ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
				2545	i_data[EXT4_TIND_BLOCK] = 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2546	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2547	case EXT4_TIND_BLOCK:
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2548	;
				2549	}
				2550
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2551	ext4_discard_reservation(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2552
Aneesh Kumar K.V	0e855ac	2008-01-28 23:58:26 -0500	[diff] [blame]	2553	up_write(&ei->i_data_sem);
Kalpak Shah	ef7f383	2007-07-18 09:15:20 -0400	[diff] [blame]	2554	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2555	ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2556
				2557	/*
				2558	* In a multi-transaction truncate, we only make the final transaction
				2559	* synchronous
				2560	*/
				2561	if (IS_SYNC(inode))
				2562	handle->h_sync = 1;
				2563	out_stop:
				2564	/*
				2565	* If this was a simple ftruncate(), and the file will remain alive
				2566	* then we need to clear up the orphan record which we created above.
				2567	* However, if this was a real unlink then we were called by
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2568	* ext4_delete_inode(), and we allow that function to clean up the
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2569	* orphan info for us.
				2570	*/
				2571	if (inode->i_nlink)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2572	ext4_orphan_del(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2573
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2574	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2575	}
				2576
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2577	static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
				2578	unsigned long ino, struct ext4_iloc *iloc)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2579	{
Avantika Mathur	fd2d429	2008-01-28 23:58:27 -0500	[diff] [blame]	2580	ext4_group_t block_group;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2581	unsigned long offset;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2582	ext4_fsblk_t block;
Akinobu Mita	c0a4ef3	2008-04-17 10:38:59 -0400	[diff] [blame]	2583	struct ext4_group_desc *gdp;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2584
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2585	if (!ext4_valid_inum(sb, ino)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2586	/*
				2587	* This error is already checked for in namei.c unless we are
				2588	* looking at an NFS filehandle, in which case no error
				2589	* report is needed
				2590	*/
				2591	return 0;
				2592	}
				2593
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2594	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
Akinobu Mita	c0a4ef3	2008-04-17 10:38:59 -0400	[diff] [blame]	2595	gdp = ext4_get_group_desc(sb, block_group, NULL);
				2596	if (!gdp)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2597	return 0;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2598
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2599	/*
				2600	* Figure out the offset within the block group inode table
				2601	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2602	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
				2603	EXT4_INODE_SIZE(sb);
Alexandre Ratchov	8fadc14	2006-10-11 01:21:15 -0700	[diff] [blame]	2604	block = ext4_inode_table(sb, gdp) +
				2605	(offset >> EXT4_BLOCK_SIZE_BITS(sb));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2606
				2607	iloc->block_group = block_group;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2608	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2609	return block;
				2610	}
				2611
				2612	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2613	* ext4_get_inode_loc returns with an extra refcount against the inode's
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2614	* underlying buffer_head on success. If 'in_mem' is true, we have all
				2615	* data in memory that is needed to recreate the on-disk version of this
				2616	* inode.
				2617	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2618	static int __ext4_get_inode_loc(struct inode *inode,
				2619	struct ext4_iloc *iloc, int in_mem)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2620	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2621	ext4_fsblk_t block;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2622	struct buffer_head *bh;
				2623
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2624	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2625	if (!block)
				2626	return -EIO;
				2627
				2628	bh = sb_getblk(inode->i_sb, block);
				2629	if (!bh) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2630	ext4_error (inode->i_sb, "ext4_get_inode_loc",
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2631	"unable to read inode block - "
Mingming Cao	2ae0210	2006-10-11 01:21:11 -0700	[diff] [blame]	2632	"inode=%lu, block=%llu",
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2633	inode->i_ino, block);
				2634	return -EIO;
				2635	}
				2636	if (!buffer_uptodate(bh)) {
				2637	lock_buffer(bh);
				2638	if (buffer_uptodate(bh)) {
				2639	/* someone brought it uptodate while we waited */
				2640	unlock_buffer(bh);
				2641	goto has_buffer;
				2642	}
				2643
				2644	/*
				2645	* If we have all information of the inode in memory and this
				2646	* is the only valid inode in the block, we need not read the
				2647	* block.
				2648	*/
				2649	if (in_mem) {
				2650	struct buffer_head *bitmap_bh;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2651	struct ext4_group_desc *desc;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2652	int inodes_per_buffer;
				2653	int inode_offset, i;
Avantika Mathur	fd2d429	2008-01-28 23:58:27 -0500	[diff] [blame]	2654	ext4_group_t block_group;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2655	int start;
				2656
				2657	block_group = (inode->i_ino - 1) /
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2658	EXT4_INODES_PER_GROUP(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2659	inodes_per_buffer = bh->b_size /
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2660	EXT4_INODE_SIZE(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2661	inode_offset = ((inode->i_ino - 1) %
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2662	EXT4_INODES_PER_GROUP(inode->i_sb));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2663	start = inode_offset & ~(inodes_per_buffer - 1);
				2664
				2665	/* Is the inode bitmap in cache? */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2666	desc = ext4_get_group_desc(inode->i_sb,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2667	block_group, NULL);
				2668	if (!desc)
				2669	goto make_io;
				2670
				2671	bitmap_bh = sb_getblk(inode->i_sb,
Alexandre Ratchov	8fadc14	2006-10-11 01:21:15 -0700	[diff] [blame]	2672	ext4_inode_bitmap(inode->i_sb, desc));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2673	if (!bitmap_bh)
				2674	goto make_io;
				2675
				2676	/*
				2677	* If the inode bitmap isn't in cache then the
				2678	* optimisation may end up performing two reads instead
				2679	* of one, so skip it.
				2680	*/
				2681	if (!buffer_uptodate(bitmap_bh)) {
				2682	brelse(bitmap_bh);
				2683	goto make_io;
				2684	}
				2685	for (i = start; i < start + inodes_per_buffer; i++) {
				2686	if (i == inode_offset)
				2687	continue;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2688	if (ext4_test_bit(i, bitmap_bh->b_data))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2689	break;
				2690	}
				2691	brelse(bitmap_bh);
				2692	if (i == start + inodes_per_buffer) {
				2693	/* all other inodes are free, so skip I/O */
				2694	memset(bh->b_data, 0, bh->b_size);
				2695	set_buffer_uptodate(bh);
				2696	unlock_buffer(bh);
				2697	goto has_buffer;
				2698	}
				2699	}
				2700
				2701	make_io:
				2702	/*
				2703	* There are other valid inodes in the buffer, this inode
				2704	* has in-inode xattrs, or we don't have this inode in memory.
				2705	* Read the block from disk.
				2706	*/
				2707	get_bh(bh);
				2708	bh->b_end_io = end_buffer_read_sync;
				2709	submit_bh(READ_META, bh);
				2710	wait_on_buffer(bh);
				2711	if (!buffer_uptodate(bh)) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2712	ext4_error(inode->i_sb, "ext4_get_inode_loc",
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2713	"unable to read inode block - "
Mingming Cao	2ae0210	2006-10-11 01:21:11 -0700	[diff] [blame]	2714	"inode=%lu, block=%llu",
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2715	inode->i_ino, block);
				2716	brelse(bh);
				2717	return -EIO;
				2718	}
				2719	}
				2720	has_buffer:
				2721	iloc->bh = bh;
				2722	return 0;
				2723	}
				2724
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2725	int ext4_get_inode_loc(struct inode inode, struct ext4_iloc iloc)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2726	{
				2727	/* We have all inode data except xattrs in memory here. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2728	return __ext4_get_inode_loc(inode, iloc,
				2729	!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2730	}
				2731
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2732	void ext4_set_inode_flags(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2733	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2734	unsigned int flags = EXT4_I(inode)->i_flags;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2735
				2736	inode->i_flags &= ~(S_SYNC\|S_APPEND\|S_IMMUTABLE\|S_NOATIME\|S_DIRSYNC);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2737	if (flags & EXT4_SYNC_FL)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2738	inode->i_flags \|= S_SYNC;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2739	if (flags & EXT4_APPEND_FL)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2740	inode->i_flags \|= S_APPEND;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2741	if (flags & EXT4_IMMUTABLE_FL)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2742	inode->i_flags \|= S_IMMUTABLE;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2743	if (flags & EXT4_NOATIME_FL)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2744	inode->i_flags \|= S_NOATIME;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2745	if (flags & EXT4_DIRSYNC_FL)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2746	inode->i_flags \|= S_DIRSYNC;
				2747	}
				2748
Jan Kara	ff9ddf7	2007-07-18 09:24:20 -0400	[diff] [blame]	2749	/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
				2750	void ext4_get_inode_flags(struct ext4_inode_info *ei)
				2751	{
				2752	unsigned int flags = ei->vfs_inode.i_flags;
				2753
				2754	ei->i_flags &= ~(EXT4_SYNC_FL\|EXT4_APPEND_FL\|
				2755	EXT4_IMMUTABLE_FL\|EXT4_NOATIME_FL\|EXT4_DIRSYNC_FL);
				2756	if (flags & S_SYNC)
				2757	ei->i_flags \|= EXT4_SYNC_FL;
				2758	if (flags & S_APPEND)
				2759	ei->i_flags \|= EXT4_APPEND_FL;
				2760	if (flags & S_IMMUTABLE)
				2761	ei->i_flags \|= EXT4_IMMUTABLE_FL;
				2762	if (flags & S_NOATIME)
				2763	ei->i_flags \|= EXT4_NOATIME_FL;
				2764	if (flags & S_DIRSYNC)
				2765	ei->i_flags \|= EXT4_DIRSYNC_FL;
				2766	}
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2767	static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
				2768	struct ext4_inode_info *ei)
				2769	{
				2770	blkcnt_t i_blocks ;
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2771	struct inode *inode = &(ei->vfs_inode);
				2772	struct super_block *sb = inode->i_sb;
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2773
				2774	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
				2775	EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
				2776	/* we are using combined 48 bit field */
				2777	i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 \|
				2778	le32_to_cpu(raw_inode->i_blocks_lo);
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2779	if (ei->i_flags & EXT4_HUGE_FILE_FL) {
				2780	/* i_blocks represent file system block size */
				2781	return i_blocks << (inode->i_blkbits - 9);
				2782	} else {
				2783	return i_blocks;
				2784	}
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2785	} else {
				2786	return le32_to_cpu(raw_inode->i_blocks_lo);
				2787	}
				2788	}
Jan Kara	ff9ddf7	2007-07-18 09:24:20 -0400	[diff] [blame]	2789
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2790	struct inode ext4_iget(struct super_block sb, unsigned long ino)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2791	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2792	struct ext4_iloc iloc;
				2793	struct ext4_inode *raw_inode;
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2794	struct ext4_inode_info *ei;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2795	struct buffer_head *bh;
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2796	struct inode *inode;
				2797	long ret;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2798	int block;
				2799
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2800	inode = iget_locked(sb, ino);
				2801	if (!inode)
				2802	return ERR_PTR(-ENOMEM);
				2803	if (!(inode->i_state & I_NEW))
				2804	return inode;
				2805
				2806	ei = EXT4_I(inode);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2807	#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
				2808	ei->i_acl = EXT4_ACL_NOT_CACHED;
				2809	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2810	#endif
				2811	ei->i_block_alloc_info = NULL;
				2812
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2813	ret = __ext4_get_inode_loc(inode, &iloc, 0);
				2814	if (ret < 0)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2815	goto bad_inode;
				2816	bh = iloc.bh;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2817	raw_inode = ext4_raw_inode(&iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2818	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
				2819	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
				2820	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
				2821	if(!(test_opt (inode->i_sb, NO_UID32))) {
				2822	inode->i_uid \|= le16_to_cpu(raw_inode->i_uid_high) << 16;
				2823	inode->i_gid \|= le16_to_cpu(raw_inode->i_gid_high) << 16;
				2824	}
				2825	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2826
				2827	ei->i_state = 0;
				2828	ei->i_dir_start_lookup = 0;
				2829	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
				2830	/* We now have enough fields to check if the inode was active or not.
				2831	* This is needed because nfsd might try to access dead inodes
				2832	* the test is that same one that e2fsck uses
				2833	* NeilBrown 1999oct15
				2834	*/
				2835	if (inode->i_nlink == 0) {
				2836	if (inode->i_mode == 0 \|\|
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2837	!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2838	/* this inode is deleted */
				2839	brelse (bh);
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2840	ret = -ESTALE;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2841	goto bad_inode;
				2842	}
				2843	/* The only unlinked inodes we let through here have
				2844	* valid i_mode and are being read by the orphan
				2845	* recovery code: that's fine, we're about to complete
				2846	* the process of deleting those. */
				2847	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2848	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2849	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
Aneesh Kumar K.V	7973c0c	2008-01-28 23:58:27 -0500	[diff] [blame]	2850	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
Mingming Cao	9b8f1f0	2006-10-11 01:21:13 -0700	[diff] [blame]	2851	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
Aneesh Kumar K.V	a48380f	2008-01-28 23:58:27 -0500	[diff] [blame]	2852	cpu_to_le32(EXT4_OS_HURD)) {
Badari Pulavarty	a1ddeb7	2006-10-11 01:21:09 -0700	[diff] [blame]	2853	ei->i_file_acl \|=
				2854	((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2855	}
Aneesh Kumar K.V	a48380f	2008-01-28 23:58:27 -0500	[diff] [blame]	2856	inode->i_size = ext4_isize(raw_inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2857	ei->i_disksize = inode->i_size;
				2858	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
				2859	ei->i_block_group = iloc.block_group;
				2860	/*
				2861	* NOTE! The in-memory inode i_data array is in little-endian order
				2862	* even on big-endian machines: we do NOT byteswap the block numbers!
				2863	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2864	for (block = 0; block < EXT4_N_BLOCKS; block++)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2865	ei->i_data[block] = raw_inode->i_block[block];
				2866	INIT_LIST_HEAD(&ei->i_orphan);
				2867
Eric Sandeen	0040d98	2008-02-05 22:36:43 -0500	[diff] [blame]	2868	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2869	ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2870	if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
Kirill Korotaev	e5d2861	2007-06-23 17:16:51 -0700	[diff] [blame]	2871	EXT4_INODE_SIZE(inode->i_sb)) {
				2872	brelse (bh);
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2873	ret = -EIO;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2874	goto bad_inode;
Kirill Korotaev	e5d2861	2007-06-23 17:16:51 -0700	[diff] [blame]	2875	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2876	if (ei->i_extra_isize == 0) {
				2877	/* The extra space is currently unused. Use it. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2878	ei->i_extra_isize = sizeof(struct ext4_inode) -
				2879	EXT4_GOOD_OLD_INODE_SIZE;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2880	} else {
				2881	__le32 magic = (void )raw_inode +
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2882	EXT4_GOOD_OLD_INODE_SIZE +
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2883	ei->i_extra_isize;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2884	if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
				2885	ei->i_state \|= EXT4_STATE_XATTR;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2886	}
				2887	} else
				2888	ei->i_extra_isize = 0;
				2889
Kalpak Shah	ef7f383	2007-07-18 09:15:20 -0400	[diff] [blame]	2890	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
				2891	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
				2892	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
				2893	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
				2894
Jean Noel Cordenner	25ec56b	2008-01-28 23:58:27 -0500	[diff] [blame]	2895	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
				2896	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
				2897	if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
				2898	inode->i_version \|=
				2899	(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
				2900	}
				2901
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2902	if (S_ISREG(inode->i_mode)) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2903	inode->i_op = &ext4_file_inode_operations;
				2904	inode->i_fop = &ext4_file_operations;
				2905	ext4_set_aops(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2906	} else if (S_ISDIR(inode->i_mode)) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2907	inode->i_op = &ext4_dir_inode_operations;
				2908	inode->i_fop = &ext4_dir_operations;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2909	} else if (S_ISLNK(inode->i_mode)) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2910	if (ext4_inode_is_fast_symlink(inode))
				2911	inode->i_op = &ext4_fast_symlink_inode_operations;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2912	else {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2913	inode->i_op = &ext4_symlink_inode_operations;
				2914	ext4_set_aops(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2915	}
				2916	} else {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2917	inode->i_op = &ext4_special_inode_operations;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2918	if (raw_inode->i_block[0])
				2919	init_special_inode(inode, inode->i_mode,
				2920	old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
				2921	else
				2922	init_special_inode(inode, inode->i_mode,
				2923	new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
				2924	}
				2925	brelse (iloc.bh);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2926	ext4_set_inode_flags(inode);
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2927	unlock_new_inode(inode);
				2928	return inode;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2929
				2930	bad_inode:
David Howells	1d1fe1e	2008-02-07 00:15:37 -0800	[diff] [blame]	2931	iget_failed(inode);
				2932	return ERR_PTR(ret);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2933	}
				2934
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2935	static int ext4_inode_blocks_set(handle_t *handle,
				2936	struct ext4_inode *raw_inode,
				2937	struct ext4_inode_info *ei)
				2938	{
				2939	struct inode *inode = &(ei->vfs_inode);
				2940	u64 i_blocks = inode->i_blocks;
				2941	struct super_block *sb = inode->i_sb;
				2942	int err = 0;
				2943
				2944	if (i_blocks <= ~0U) {
				2945	/*
				2946	* i_blocks can be represnted in a 32 bit variable
				2947	* as multiple of 512 bytes
				2948	*/
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2949	raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2950	raw_inode->i_blocks_high = 0;
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2951	ei->i_flags &= ~EXT4_HUGE_FILE_FL;
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2952	} else if (i_blocks <= 0xffffffffffffULL) {
				2953	/*
				2954	* i_blocks can be represented in a 48 bit variable
				2955	* as multiple of 512 bytes
				2956	*/
				2957	err = ext4_update_rocompat_feature(handle, sb,
				2958	EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
				2959	if (err)
				2960	goto err_out;
				2961	/* i_block is stored in the split 48 bit fields */
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2962	raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2963	raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2964	ei->i_flags &= ~EXT4_HUGE_FILE_FL;
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2965	} else {
Aneesh Kumar K.V	8180a56	2008-01-28 23:58:27 -0500	[diff] [blame]	2966	/*
				2967	* i_blocks should be represented in a 48 bit variable
				2968	* as multiple of file system block size
				2969	*/
				2970	err = ext4_update_rocompat_feature(handle, sb,
				2971	EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
				2972	if (err)
				2973	goto err_out;
				2974	ei->i_flags \|= EXT4_HUGE_FILE_FL;
				2975	/* i_block is stored in file system block size */
				2976	i_blocks = i_blocks >> (inode->i_blkbits - 9);
				2977	raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
				2978	raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	2979	}
				2980	err_out:
				2981	return err;
				2982	}
				2983
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2984	/*
				2985	* Post the struct inode info into an on-disk inode location in the
				2986	* buffer-cache. This gobbles the caller's reference to the
				2987	* buffer_head in the inode location struct.
				2988	*
				2989	* The caller must have write access to iloc->bh.
				2990	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2991	static int ext4_do_update_inode(handle_t *handle,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2992	struct inode *inode,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2993	struct ext4_iloc *iloc)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2994	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	2995	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
				2996	struct ext4_inode_info *ei = EXT4_I(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	2997	struct buffer_head *bh = iloc->bh;
				2998	int err = 0, rc, block;
				2999
				3000	/* For fields not not tracking in the in-memory inode,
				3001	* initialise them to zero for new inodes. */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3002	if (ei->i_state & EXT4_STATE_NEW)
				3003	memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3004
Jan Kara	ff9ddf7	2007-07-18 09:24:20 -0400	[diff] [blame]	3005	ext4_get_inode_flags(ei);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3006	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
				3007	if(!(test_opt(inode->i_sb, NO_UID32))) {
				3008	raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
				3009	raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
				3010	/*
				3011	* Fix up interoperability with old kernels. Otherwise, old inodes get
				3012	* re-used with the upper 16 bits of the uid/gid intact
				3013	*/
				3014	if(!ei->i_dtime) {
				3015	raw_inode->i_uid_high =
				3016	cpu_to_le16(high_16_bits(inode->i_uid));
				3017	raw_inode->i_gid_high =
				3018	cpu_to_le16(high_16_bits(inode->i_gid));
				3019	} else {
				3020	raw_inode->i_uid_high = 0;
				3021	raw_inode->i_gid_high = 0;
				3022	}
				3023	} else {
				3024	raw_inode->i_uid_low =
				3025	cpu_to_le16(fs_high2lowuid(inode->i_uid));
				3026	raw_inode->i_gid_low =
				3027	cpu_to_le16(fs_high2lowgid(inode->i_gid));
				3028	raw_inode->i_uid_high = 0;
				3029	raw_inode->i_gid_high = 0;
				3030	}
				3031	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
Kalpak Shah	ef7f383	2007-07-18 09:15:20 -0400	[diff] [blame]	3032
				3033	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
				3034	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
				3035	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
				3036	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
				3037
Aneesh Kumar K.V	0fc1b45	2008-01-28 23:58:26 -0500	[diff] [blame]	3038	if (ext4_inode_blocks_set(handle, raw_inode, ei))
				3039	goto out_brelse;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3040	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
Aneesh Kumar K.V	267e4db	2008-04-29 08:11:12 -0400	[diff] [blame]	3041	/* clear the migrate flag in the raw_inode */
				3042	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
Mingming Cao	9b8f1f0	2006-10-11 01:21:13 -0700	[diff] [blame]	3043	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
				3044	cpu_to_le32(EXT4_OS_HURD))
Badari Pulavarty	a1ddeb7	2006-10-11 01:21:09 -0700	[diff] [blame]	3045	raw_inode->i_file_acl_high =
				3046	cpu_to_le16(ei->i_file_acl >> 32);
Aneesh Kumar K.V	7973c0c	2008-01-28 23:58:27 -0500	[diff] [blame]	3047	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
Aneesh Kumar K.V	a48380f	2008-01-28 23:58:27 -0500	[diff] [blame]	3048	ext4_isize_set(raw_inode, ei->i_disksize);
				3049	if (ei->i_disksize > 0x7fffffffULL) {
				3050	struct super_block *sb = inode->i_sb;
				3051	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
				3052	EXT4_FEATURE_RO_COMPAT_LARGE_FILE) \|\|
				3053	EXT4_SB(sb)->s_es->s_rev_level ==
				3054	cpu_to_le32(EXT4_GOOD_OLD_REV)) {
				3055	/* If this is the first large file
				3056	* created, add a flag to the superblock.
				3057	*/
				3058	err = ext4_journal_get_write_access(handle,
				3059	EXT4_SB(sb)->s_sbh);
				3060	if (err)
				3061	goto out_brelse;
				3062	ext4_update_dynamic_rev(sb);
				3063	EXT4_SET_RO_COMPAT_FEATURE(sb,
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3064	EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
Aneesh Kumar K.V	a48380f	2008-01-28 23:58:27 -0500	[diff] [blame]	3065	sb->s_dirt = 1;
				3066	handle->h_sync = 1;
				3067	err = ext4_journal_dirty_metadata(handle,
				3068	EXT4_SB(sb)->s_sbh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3069	}
				3070	}
				3071	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
				3072	if (S_ISCHR(inode->i_mode) \|\| S_ISBLK(inode->i_mode)) {
				3073	if (old_valid_dev(inode->i_rdev)) {
				3074	raw_inode->i_block[0] =
				3075	cpu_to_le32(old_encode_dev(inode->i_rdev));
				3076	raw_inode->i_block[1] = 0;
				3077	} else {
				3078	raw_inode->i_block[0] = 0;
				3079	raw_inode->i_block[1] =
				3080	cpu_to_le32(new_encode_dev(inode->i_rdev));
				3081	raw_inode->i_block[2] = 0;
				3082	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3083	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3084	raw_inode->i_block[block] = ei->i_data[block];
				3085
Jean Noel Cordenner	25ec56b	2008-01-28 23:58:27 -0500	[diff] [blame]	3086	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
				3087	if (ei->i_extra_isize) {
				3088	if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
				3089	raw_inode->i_version_hi =
				3090	cpu_to_le32(inode->i_version >> 32);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3091	raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
Jean Noel Cordenner	25ec56b	2008-01-28 23:58:27 -0500	[diff] [blame]	3092	}
				3093
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3094
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3095	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
				3096	rc = ext4_journal_dirty_metadata(handle, bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3097	if (!err)
				3098	err = rc;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3099	ei->i_state &= ~EXT4_STATE_NEW;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3100
				3101	out_brelse:
				3102	brelse (bh);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3103	ext4_std_error(inode->i_sb, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3104	return err;
				3105	}
				3106
				3107	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3108	* ext4_write_inode()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3109	*
				3110	* We are called from a few places:
				3111	*
				3112	* - Within generic_file_write() for O_SYNC files.
				3113	* Here, there will be no transaction running. We wait for any running
				3114	* trasnaction to commit.
				3115	*
				3116	* - Within sys_sync(), kupdate and such.
				3117	* We wait on commit, if tol to.
				3118	*
				3119	* - Within prune_icache() (PF_MEMALLOC == true)
				3120	* Here we simply return. We can't afford to block kswapd on the
				3121	* journal commit.
				3122	*
				3123	* In all cases it is actually safe for us to return without doing anything,
				3124	* because the inode has been copied into a raw inode buffer in
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3125	* ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3126	* knfsd.
				3127	*
				3128	* Note that we are absolutely dependent upon all inode dirtiers doing the
				3129	* right thing: they must call mark_inode_dirty() after dirtying info in
				3130	* which we are interested.
				3131	*
				3132	* It would be a bug for them to not do this. The code:
				3133	*
				3134	* mark_inode_dirty(inode)
				3135	* stuff();
				3136	* inode->i_size = expr;
				3137	*
				3138	* is in error because a kswapd-driven write_inode() could occur while
				3139	* `stuff()' is running, and the new i_size will be lost. Plus the inode
				3140	* will no longer be on the superblock's dirty inode list.
				3141	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3142	int ext4_write_inode(struct inode *inode, int wait)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3143	{
				3144	if (current->flags & PF_MEMALLOC)
				3145	return 0;
				3146
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3147	if (ext4_journal_current_handle()) {
Mingming Cao	b38bd33	2007-07-19 01:48:35 -0700	[diff] [blame]	3148	jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3149	dump_stack();
				3150	return -EIO;
				3151	}
				3152
				3153	if (!wait)
				3154	return 0;
				3155
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3156	return ext4_force_commit(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3157	}
				3158
				3159	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3160	* ext4_setattr()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3161	*
				3162	* Called from notify_change.
				3163	*
				3164	* We want to trap VFS attempts to truncate the file as soon as
				3165	* possible. In particular, we want to make sure that when the VFS
				3166	* shrinks i_size, we put the inode on the orphan list and modify
				3167	* i_disksize immediately, so that during the subsequent flushing of
				3168	* dirty pages and freeing of disk blocks, we can guarantee that any
				3169	* commit will leave the blocks being flushed in an unused state on
				3170	* disk. (On recovery, the inode will get truncated and the blocks will
				3171	* be freed, so we have a strong guarantee that no future commit will
				3172	* leave these blocks visible to the user.)
				3173	*
				3174	* Called with inode->sem down.
				3175	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3176	int ext4_setattr(struct dentry dentry, struct iattr attr)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3177	{
				3178	struct inode *inode = dentry->d_inode;
				3179	int error, rc = 0;
				3180	const unsigned int ia_valid = attr->ia_valid;
				3181
				3182	error = inode_change_ok(inode, attr);
				3183	if (error)
				3184	return error;
				3185
				3186	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) \|\|
				3187	(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
				3188	handle_t *handle;
				3189
				3190	/* (user+group)*(old+new) structure, inode write (sb,
				3191	* inode block, ? - but truncate inode update has it) */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3192	handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
				3193	EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3194	if (IS_ERR(handle)) {
				3195	error = PTR_ERR(handle);
				3196	goto err_out;
				3197	}
				3198	error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
				3199	if (error) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3200	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3201	return error;
				3202	}
				3203	/* Update corresponding info in inode so that everything is in
				3204	* one transaction */
				3205	if (attr->ia_valid & ATTR_UID)
				3206	inode->i_uid = attr->ia_uid;
				3207	if (attr->ia_valid & ATTR_GID)
				3208	inode->i_gid = attr->ia_gid;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3209	error = ext4_mark_inode_dirty(handle, inode);
				3210	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3211	}
				3212
Eric Sandeen	e2b4657	2008-01-28 23:58:27 -0500	[diff] [blame]	3213	if (attr->ia_valid & ATTR_SIZE) {
				3214	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
				3215	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
				3216
				3217	if (attr->ia_size > sbi->s_bitmap_maxbytes) {
				3218	error = -EFBIG;
				3219	goto err_out;
				3220	}
				3221	}
				3222	}
				3223
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3224	if (S_ISREG(inode->i_mode) &&
				3225	attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
				3226	handle_t *handle;
				3227
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3228	handle = ext4_journal_start(inode, 3);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3229	if (IS_ERR(handle)) {
				3230	error = PTR_ERR(handle);
				3231	goto err_out;
				3232	}
				3233
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3234	error = ext4_orphan_add(handle, inode);
				3235	EXT4_I(inode)->i_disksize = attr->ia_size;
				3236	rc = ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3237	if (!error)
				3238	error = rc;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3239	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3240	}
				3241
				3242	rc = inode_setattr(inode, attr);
				3243
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3244	/* If inode_setattr's call to ext4_truncate failed to get a
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3245	* transaction handle at all, we need to clean up the in-core
				3246	* orphan list manually. */
				3247	if (inode->i_nlink)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3248	ext4_orphan_del(NULL, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3249
				3250	if (!rc && (ia_valid & ATTR_MODE))
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3251	rc = ext4_acl_chmod(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3252
				3253	err_out:
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3254	ext4_std_error(inode->i_sb, error);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3255	if (!error)
				3256	error = rc;
				3257	return error;
				3258	}
				3259
				3260
				3261	/*
				3262	* How many blocks doth make a writepage()?
				3263	*
				3264	* With N blocks per page, it may be:
				3265	* N data blocks
				3266	* 2 indirect block
				3267	* 2 dindirect
				3268	* 1 tindirect
				3269	* N+5 bitmap blocks (from the above)
				3270	* N+5 group descriptor summary blocks
				3271	* 1 inode block
				3272	* 1 superblock.
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3273	* 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3274	*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3275	* 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3276	*
				3277	* With ordered or writeback data it's the same, less the N data blocks.
				3278	*
				3279	* If the inode's direct blocks can hold an integral number of pages then a
				3280	* page cannot straddle two indirect blocks, and we can only touch one indirect
				3281	* and dindirect block, and the "5" above becomes "3".
				3282	*
				3283	* This still overestimates under most circumstances. If we were to pass the
				3284	* start and end offsets in here as well we could do block_to_path() on each
				3285	* block and work out the exact number of indirects which are touched. Pah.
				3286	*/
				3287
Alex Tomas	a86c618	2006-10-11 01:21:03 -0700	[diff] [blame]	3288	int ext4_writepage_trans_blocks(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3289	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3290	int bpp = ext4_journal_blocks_per_page(inode);
				3291	int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3292	int ret;
				3293
Alex Tomas	a86c618	2006-10-11 01:21:03 -0700	[diff] [blame]	3294	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
				3295	return ext4_ext_writepage_trans_blocks(inode, bpp);
				3296
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3297	if (ext4_should_journal_data(inode))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3298	ret = 3 * (bpp + indirects) + 2;
				3299	else
				3300	ret = 2 * (bpp + indirects) + 2;
				3301
				3302	#ifdef CONFIG_QUOTA
				3303	/* We know that structure was already allocated during DQUOT_INIT so
				3304	* we will be updating only the data blocks + inodes */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3305	ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3306	#endif
				3307
				3308	return ret;
				3309	}
				3310
				3311	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3312	* The caller must have previously called ext4_reserve_inode_write().
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3313	* Give this, we know that the caller already has write access to iloc->bh.
				3314	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3315	int ext4_mark_iloc_dirty(handle_t *handle,
				3316	struct inode inode, struct ext4_iloc iloc)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3317	{
				3318	int err = 0;
				3319
Jean Noel Cordenner	25ec56b	2008-01-28 23:58:27 -0500	[diff] [blame]	3320	if (test_opt(inode->i_sb, I_VERSION))
				3321	inode_inc_iversion(inode);
				3322
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3323	/* the do_update_inode consumes one bh->b_count */
				3324	get_bh(iloc->bh);
				3325
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	3326	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3327	err = ext4_do_update_inode(handle, inode, iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3328	put_bh(iloc->bh);
				3329	return err;
				3330	}
				3331
				3332	/*
				3333	* On success, We end up with an outstanding reference count against
				3334	* iloc->bh. This _must_ be cleaned up later.
				3335	*/
				3336
				3337	int
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3338	ext4_reserve_inode_write(handle_t handle, struct inode inode,
				3339	struct ext4_iloc *iloc)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3340	{
				3341	int err = 0;
				3342	if (handle) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3343	err = ext4_get_inode_loc(inode, iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3344	if (!err) {
				3345	BUFFER_TRACE(iloc->bh, "get_write_access");
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3346	err = ext4_journal_get_write_access(handle, iloc->bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3347	if (err) {
				3348	brelse(iloc->bh);
				3349	iloc->bh = NULL;
				3350	}
				3351	}
				3352	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3353	ext4_std_error(inode->i_sb, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3354	return err;
				3355	}
				3356
				3357	/*
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3358	* Expand an inode by new_extra_isize bytes.
				3359	* Returns 0 on success or negative error number on failure.
				3360	*/
Aneesh Kumar K.V	1d03ec9	2008-01-28 23:58:27 -0500	[diff] [blame]	3361	static int ext4_expand_extra_isize(struct inode *inode,
				3362	unsigned int new_extra_isize,
				3363	struct ext4_iloc iloc,
				3364	handle_t *handle)
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3365	{
				3366	struct ext4_inode *raw_inode;
				3367	struct ext4_xattr_ibody_header *header;
				3368	struct ext4_xattr_entry *entry;
				3369
				3370	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
				3371	return 0;
				3372
				3373	raw_inode = ext4_raw_inode(&iloc);
				3374
				3375	header = IHDR(inode, raw_inode);
				3376	entry = IFIRST(header);
				3377
				3378	/* No extended attributes present */
				3379	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) \|\|
				3380	header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
				3381	memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
				3382	new_extra_isize);
				3383	EXT4_I(inode)->i_extra_isize = new_extra_isize;
				3384	return 0;
				3385	}
				3386
				3387	/* try to expand with EAs present */
				3388	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
				3389	raw_inode, handle);
				3390	}
				3391
				3392	/*
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3393	* What we do here is to mark the in-core inode as clean with respect to inode
				3394	* dirtiness (it may still be data-dirty).
				3395	* This means that the in-core inode may be reaped by prune_icache
				3396	* without having to perform any I/O. This is a very good thing,
				3397	* because any task may call prune_icache - even ones which
				3398	* have a transaction open against a different journal.
				3399	*
				3400	* Is this cheating? Not really. Sure, we haven't written the
				3401	* inode out, but prune_icache isn't a user-visible syncing function.
				3402	* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
				3403	* we start and wait on commits.
				3404	*
				3405	* Is this efficient/effective? Well, we're being nice to the system
				3406	* by cleaning up our inodes proactively so they can be reaped
				3407	* without I/O. But we are potentially leaving up to five seconds'
				3408	* worth of inodes floating about which prune_icache wants us to
				3409	* write out. One way to fix that would be to get prune_icache()
				3410	* to do a write_super() to free up some memory. It has the desired
				3411	* effect.
				3412	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3413	int ext4_mark_inode_dirty(handle_t handle, struct inode inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3414	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3415	struct ext4_iloc iloc;
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3416	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
				3417	static unsigned int mnt_count;
				3418	int err, ret;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3419
				3420	might_sleep();
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3421	err = ext4_reserve_inode_write(handle, inode, &iloc);
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3422	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
				3423	!(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
				3424	/*
				3425	* We need extra buffer credits since we may write into EA block
				3426	* with this same handle. If journal_extend fails, then it will
				3427	* only result in a minor loss of functionality for that inode.
				3428	* If this is felt to be critical, then e2fsck should be run to
				3429	* force a large enough s_min_extra_isize.
				3430	*/
				3431	if ((jbd2_journal_extend(handle,
				3432	EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
				3433	ret = ext4_expand_extra_isize(inode,
				3434	sbi->s_want_extra_isize,
				3435	iloc, handle);
				3436	if (ret) {
				3437	EXT4_I(inode)->i_state \|= EXT4_STATE_NO_EXPAND;
Aneesh Kumar K.V	c1bddad	2007-10-16 18:38:25 -0400	[diff] [blame]	3438	if (mnt_count !=
				3439	le16_to_cpu(sbi->s_es->s_mnt_count)) {
Harvey Harrison	46e665e	2008-04-17 10:38:59 -0400	[diff] [blame]	3440	ext4_warning(inode->i_sb, __func__,
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3441	"Unable to expand inode %lu. Delete"
				3442	" some EAs or run e2fsck.",
				3443	inode->i_ino);
Aneesh Kumar K.V	c1bddad	2007-10-16 18:38:25 -0400	[diff] [blame]	3444	mnt_count =
				3445	le16_to_cpu(sbi->s_es->s_mnt_count);
Kalpak Shah	6dd4ee7	2007-07-18 09:19:57 -0400	[diff] [blame]	3446	}
				3447	}
				3448	}
				3449	}
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3450	if (!err)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3451	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3452	return err;
				3453	}
				3454
				3455	/*
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3456	* ext4_dirty_inode() is called from __mark_inode_dirty()
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3457	*
				3458	* We're really interested in the case where a file is being extended.
				3459	* i_size has been changed by generic_commit_write() and we thus need
				3460	* to include the updated inode in the current transaction.
				3461	*
				3462	* Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
				3463	* are allocated to the file.
				3464	*
				3465	* If the inode is marked synchronous, we don't honour that here - doing
				3466	* so would cause a commit on atime updates, which we don't bother doing.
				3467	* We handle synchronous inodes at the highest possible level.
				3468	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3469	void ext4_dirty_inode(struct inode *inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3470	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3471	handle_t *current_handle = ext4_journal_current_handle();
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3472	handle_t *handle;
				3473
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3474	handle = ext4_journal_start(inode, 2);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3475	if (IS_ERR(handle))
				3476	goto out;
				3477	if (current_handle &&
				3478	current_handle->h_transaction != handle->h_transaction) {
				3479	/* This task has a transaction open against a different fs */
				3480	printk(KERN_EMERG "%s: transactions do not match!\n",
Harvey Harrison	46e665e	2008-04-17 10:38:59 -0400	[diff] [blame]	3481	__func__);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3482	} else {
				3483	jbd_debug(5, "marking dirty. outer handle=%p\n",
				3484	current_handle);
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3485	ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3486	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3487	ext4_journal_stop(handle);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3488	out:
				3489	return;
				3490	}
				3491
				3492	#if 0
				3493	/*
				3494	* Bind an inode's backing buffer_head into this transaction, to prevent
				3495	* it from being flushed to disk early. Unlike
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3496	* ext4_reserve_inode_write, this leaves behind no bh reference and
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3497	* returns no iloc structure, so the caller needs to repeat the iloc
				3498	* lookup to mark the inode dirty later.
				3499	*/
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3500	static int ext4_pin_inode(handle_t handle, struct inode inode)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3501	{
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3502	struct ext4_iloc iloc;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3503
				3504	int err = 0;
				3505	if (handle) {
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3506	err = ext4_get_inode_loc(inode, &iloc);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3507	if (!err) {
				3508	BUFFER_TRACE(iloc.bh, "get_write_access");
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	3509	err = jbd2_journal_get_write_access(handle, iloc.bh);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3510	if (!err)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3511	err = ext4_journal_dirty_metadata(handle,
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3512	iloc.bh);
				3513	brelse(iloc.bh);
				3514	}
				3515	}
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3516	ext4_std_error(inode->i_sb, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3517	return err;
				3518	}
				3519	#endif
				3520
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3521	int ext4_change_inode_journal_flag(struct inode *inode, int val)
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3522	{
				3523	journal_t *journal;
				3524	handle_t *handle;
				3525	int err;
				3526
				3527	/*
				3528	* We have to be very careful here: changing a data block's
				3529	* journaling status dynamically is dangerous. If we write a
				3530	* data block to the journal, change the status and then delete
				3531	* that block, we risk forgetting to revoke the old log record
				3532	* from the journal and so a subsequent replay can corrupt data.
				3533	* So, first we make sure that the journal is empty and that
				3534	* nobody is changing anything.
				3535	*/
				3536
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3537	journal = EXT4_JOURNAL(inode);
Dave Hansen	d699594	2007-07-18 08:33:51 -0400	[diff] [blame]	3538	if (is_journal_aborted(journal))
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3539	return -EROFS;
				3540
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	3541	jbd2_journal_lock_updates(journal);
				3542	jbd2_journal_flush(journal);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3543
				3544	/*
				3545	* OK, there are no updates running now, and all cached data is
				3546	* synced to disk. We are now in a completely consistent state
				3547	* which doesn't have anything in the journal, and we know that
				3548	* no filesystem updates are running, so it is safe to modify
				3549	* the inode's in-core data-journaling state flag now.
				3550	*/
				3551
				3552	if (val)
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3553	EXT4_I(inode)->i_flags \|= EXT4_JOURNAL_DATA_FL;
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3554	else
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3555	EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
				3556	ext4_set_aops(inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3557
Mingming Cao	dab291a	2006-10-11 01:21:01 -0700	[diff] [blame]	3558	jbd2_journal_unlock_updates(journal);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3559
				3560	/* Finally we can mark the inode as dirty. */
				3561
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3562	handle = ext4_journal_start(inode, 1);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3563	if (IS_ERR(handle))
				3564	return PTR_ERR(handle);
				3565
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3566	err = ext4_mark_inode_dirty(handle, inode);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3567	handle->h_sync = 1;
Mingming Cao	617ba13	2006-10-11 01:20:53 -0700	[diff] [blame]	3568	ext4_journal_stop(handle);
				3569	ext4_std_error(inode->i_sb, err);
Dave Kleikamp	ac27a0e	2006-10-11 01:20:50 -0700	[diff] [blame]	3570
				3571	return err;
				3572	}
Aneesh Kumar K.V	2e9ee85	2008-07-11 19:27:31 -0400	[diff] [blame]	3573
				3574	static int ext4_bh_unmapped(handle_t handle, struct buffer_head bh)
				3575	{
				3576	return !buffer_mapped(bh);
				3577	}
				3578
				3579	int ext4_page_mkwrite(struct vm_area_struct vma, struct page page)
				3580	{
				3581	loff_t size;
				3582	unsigned long len;
				3583	int ret = -EINVAL;
				3584	struct file *file = vma->vm_file;
				3585	struct inode *inode = file->f_path.dentry->d_inode;
				3586	struct address_space *mapping = inode->i_mapping;
				3587
				3588	/*
				3589	* Get i_alloc_sem to stop truncates messing with the inode. We cannot
				3590	* get i_mutex because we are already holding mmap_sem.
				3591	*/
				3592	down_read(&inode->i_alloc_sem);
				3593	size = i_size_read(inode);
				3594	if (page->mapping != mapping \|\| size <= page_offset(page)
				3595	\|\| !PageUptodate(page)) {
				3596	/* page got truncated from under us? */
				3597	goto out_unlock;
				3598	}
				3599	ret = 0;
				3600	if (PageMappedToDisk(page))
				3601	goto out_unlock;
				3602
				3603	if (page->index == size >> PAGE_CACHE_SHIFT)
				3604	len = size & ~PAGE_CACHE_MASK;
				3605	else
				3606	len = PAGE_CACHE_SIZE;
				3607
				3608	if (page_has_buffers(page)) {
				3609	/* return if we have all the buffers mapped */
				3610	if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
				3611	ext4_bh_unmapped))
				3612	goto out_unlock;
				3613	}
				3614	/*
				3615	* OK, we need to fill the hole... Do write_begin write_end
				3616	* to do block allocation/reservation.We are not holding
				3617	* inode.i__mutex here. That allow * parallel write_begin,
				3618	* write_end call. lock_page prevent this from happening
				3619	* on the same page though
				3620	*/
				3621	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
				3622	len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
				3623	if (ret < 0)
				3624	goto out_unlock;
				3625	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
				3626	len, len, page, NULL);
				3627	if (ret < 0)
				3628	goto out_unlock;
				3629	ret = 0;
				3630	out_unlock:
				3631	up_read(&inode->i_alloc_sem);
				3632	return ret;
				3633	}