#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
+#include <linux/magic.h>
#include <linux/buffer_head.h>
+#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
-#include <linux/kmemleak.h>
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include "internal.h"
{
return &BDEV_I(inode)->bdev;
}
-
EXPORT_SYMBOL(I_BDEV);
/*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
*/
static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst)
{
- spin_lock(&inode_wb_list_lock);
+ struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+ if (unlikely(dst == old)) /* deadlock avoidance */
+ return;
+ bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY)
list_move(&inode->i_wb_list, &dst->wb.b_dirty);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&old->wb.list_lock);
+ spin_unlock(&dst->wb.list_lock);
}
-static sector_t max_block(struct block_device *bdev)
+/* Kill _all_ buffers and pagecache , dirty or not.. */
+void kill_bdev(struct block_device *bdev)
{
- sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (sz) {
- unsigned int size = block_size(bdev);
- unsigned int sizebits = blksize_bits(size);
- retval = (sz >> sizebits);
- }
- return retval;
-}
+ if (mapping->nrpages == 0)
+ return;
-/* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
+ invalidate_bh_lrus();
+ truncate_inode_pages(mapping, 0);
+}
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
{
- if (bdev->bd_inode->i_mapping->nrpages == 0)
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages == 0)
return;
+
invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-}
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ /* 99% of the time, we don't need to flush the cleancache on the bdev.
+ * But, for the strange corners, lets be cautious
+ */
+ cleancache_invalidate_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- if (iblock >= max_block(I_BDEV(inode))) {
- if (create)
- return -EIO;
-
- /*
- * for reads, we're just trying to fill a partial page.
- * return a hole, they will have to call get_block again
- * before they can fill it, and they will get -EIO at that
- * time
- */
- return 0;
- }
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- sector_t end_block = max_block(I_BDEV(inode));
- unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
- if ((iblock + max_blocks) > end_block) {
- max_blocks = end_block - iblock;
- if ((long)max_blocks <= 0) {
- if (create)
- return -EIO; /* write fully beyond EOF */
- /*
- * It is a read which is fully beyond EOF. We return
- * a !buffer_mapped buffer
- */
- max_blocks = 0;
- }
- }
-
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- bh->b_size = max_blocks << inode->i_blkbits;
- if (max_blocks)
- set_buffer_mapped(bh);
- return 0;
-}
-
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
struct inode *inode = file->f_mapping->host;
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+ nr_segs, blkdev_get_block, NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
/*
* private llseek:
- * for a block special file file->f_path.dentry->d_inode->i_size is zero
+ * for a block special file file_inode(file)->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
loff_t size;
size = i_size_read(bd_inode);
retval = -EINVAL;
- switch (origin) {
+ switch (whence) {
case SEEK_END:
offset += size;
break;
return retval;
}
-int blkdev_fsync(struct file *filp, int datasync)
+int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct block_device *bdev = I_BDEV(bd_inode);
int error;
+
+ error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ if (error)
+ return error;
/*
* There is no need to serialise calls to blkdev_issue_flush with
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- mutex_unlock(&bd_inode->i_mutex);
-
error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
if (error == -EOPNOTSUPP)
error = 0;
- mutex_lock(&bd_inode->i_mutex);
-
return error;
}
EXPORT_SYMBOL(blkdev_fsync);
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(bdev_cachep, bdi);
}
struct list_head *p;
truncate_inode_pages(&inode->i_data, 0);
invalidate_inode_buffers(inode); /* is it needed here? */
- end_writeback(inode);
+ clear_inode(inode);
spin_lock(&bdev_lock);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
__bd_forget(list_entry(p, struct inode, i_devices));
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+ return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
}
static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-struct super_block *blockdev_superblock __read_mostly;
+static struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
- struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
- /*
- * This vfsmount structure is only used to obtain the
- * blockdev_superblock, so tell kmemleak not to report it.
- */
- kmemleak_not_leak(bd_mnt);
- blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
if (inode->i_state & I_NEW) {
bdev->bd_contains = NULL;
+ bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
return bdev;
}
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
/* Call when you free inode */
void bd_forget(struct inode *inode)
if (!bdev->bd_disk)
return;
- if (disk_partitionable(bdev->bd_disk))
+ if (disk_part_scan_enabled(bdev->bd_disk))
bdev->bd_invalidated = 1;
}
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
{
unsigned bsize = bdev_logical_block_size(bdev);
- bdev->bd_inode->i_size = size;
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, size);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
struct gendisk *disk;
+ struct module *owner;
int ret;
int partno;
int perm = 0;
disk = get_gendisk(bdev->bd_dev, &partno);
if (!disk)
goto out;
+ owner = disk->fops->owner;
disk_block_events(disk);
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
struct backing_dev_info *bdi;
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
+ bdev->bd_queue = NULL;
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
goto restart;
}
}
- if (!ret && !bdev->bd_openers) {
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(disk, bdev);
+ }
if (ret)
goto out_clear;
} else {
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
- rescan_partitions(bdev->bd_disk, bdev);
+ if (bdev->bd_invalidated) {
+ if (!ret)
+ rescan_partitions(bdev->bd_disk, bdev);
+ else if (ret == -ENOMEDIUM)
+ invalidate_partitions(bdev->bd_disk, bdev);
+ }
if (ret)
goto out_unlock_bdev;
}
/* only one opener holds refs to the module and disk */
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
}
bdev->bd_openers++;
if (for_part)
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
+ bdev->bd_queue = NULL;
bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
disk_unblock_events(disk);
- module_put(disk->fops->owner);
put_disk(disk);
+ module_put(owner);
out:
bdput(bdev);
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+ /* ->release can cause the old bdi to disappear,
+ * so must switch it out first
+ */
+ bdev_inode_switch_bdi(bdev->bd_inode,
+ &default_backing_dev_info);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
if (!bdev->bd_openers) {
struct module *owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
- bdev_inode_switch_bdi(bdev->bd_inode,
- &default_backing_dev_info);
if (bdev != bdev->bd_contains)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
+
+ put_disk(disk);
+ module_put(owner);
}
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
int blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ mutex_lock(&bdev->bd_mutex);
+
if (mode & FMODE_EXCL) {
bool bdev_free;
* are protected with bdev_lock. bd_mutex is to
* synchronize disk_holder unlinking.
*/
- mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0);
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
- if (bdev_free) {
- if (bdev->bd_write_holder) {
- disk_unblock_events(bdev->bd_disk);
- disk_check_events(bdev->bd_disk);
- bdev->bd_write_holder = false;
- }
+ if (bdev_free && bdev->bd_write_holder) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
}
-
- mutex_unlock(&bdev->bd_mutex);
}
+ /*
+ * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+ * event. This is to ensure detection of media removal commanded
+ * from userland - e.g. eject(1).
+ */
+ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
+
+ mutex_unlock(&bdev->bd_mutex);
+
return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
+static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = file->f_mapping->host;
+ loff_t size = i_size_read(bd_inode);
+
+ if (pos >= size)
+ return 0;
+
+ size -= pos;
+ if (size < INT_MAX)
+ nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
+ return generic_file_aio_read(iocb, iov, nr_segs, pos);
+}
+
/*
* Try to release a page associated with block device when the system
* is under memory pressure.
.llseek = block_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .aio_read = blkdev_aio_read,
.aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
return res;
}
EXPORT_SYMBOL(__invalidate_device);
+
+void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
+{
+ struct inode *inode, *old_inode = NULL;
+
+ spin_lock(&inode_sb_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ struct address_space *mapping = inode->i_mapping;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ mapping->nrpages == 0) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_sb_list_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+ * inode_sb_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * inode_sb_list_lock. So we keep the reference and iput it
+ * later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ func(I_BDEV(inode), arg);
+
+ spin_lock(&inode_sb_list_lock);
+ }
+ spin_unlock(&inode_sb_list_lock);
+ iput(old_inode);
+}