[LogFS] Fix bdev erases
Joern Engel [Thu, 4 Mar 2010 20:30:58 +0000 (21:30 +0100)]
Erases for block devices were always just emulated by writing 0xff.
Some time back the write was removed and only the page cache was
changed to 0xff.  Superficially a good idea, but with two problems:
1. Touching the page cache isn't necessary either.
2. However, writing out 0xff _is_ necessary for the journal.  As the
   journal is scanned linearly, an old non-overwritten commit entry
   can be used on next mount and cause havoc.

This should fix both aspects.

fs/logfs/dev_bdev.c
fs/logfs/dev_mtd.c
fs/logfs/journal.c
fs/logfs/logfs.h
fs/logfs/segment.c
fs/logfs/super.c

index 58a057b..9718c22 100644 (file)
@@ -167,27 +167,91 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
        generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
 }
 
-static int bdev_erase(struct super_block *sb, loff_t to, size_t len)
+
+static void erase_end_io(struct bio *bio, int err) 
+{ 
+       const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 
+       struct super_block *sb = bio->bi_private; 
+       struct logfs_super *super = logfs_super(sb); 
+
+       BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 
+       BUG_ON(err); 
+       BUG_ON(bio->bi_vcnt == 0); 
+       bio_put(bio); 
+       if (atomic_dec_and_test(&super->s_pending_writes))
+               wake_up(&wq); 
+} 
+
+static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
+               size_t nr_pages)
+{
+       struct logfs_super *super = logfs_super(sb);
+       struct bio *bio;
+       struct request_queue *q = bdev_get_queue(sb->s_bdev);
+       unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+       int i;
+
+       bio = bio_alloc(GFP_NOFS, max_pages);
+       BUG_ON(!bio); /* FIXME: handle this */
+
+       for (i = 0; i < nr_pages; i++) {
+               if (i >= max_pages) {
+                       /* Block layer cannot split bios :( */
+                       bio->bi_vcnt = i;
+                       bio->bi_idx = 0;
+                       bio->bi_size = i * PAGE_SIZE;
+                       bio->bi_bdev = super->s_bdev;
+                       bio->bi_sector = ofs >> 9;
+                       bio->bi_private = sb;
+                       bio->bi_end_io = erase_end_io;
+                       atomic_inc(&super->s_pending_writes);
+                       submit_bio(WRITE, bio);
+
+                       ofs += i * PAGE_SIZE;
+                       index += i;
+                       nr_pages -= i;
+                       i = 0;
+
+                       bio = bio_alloc(GFP_NOFS, max_pages);
+                       BUG_ON(!bio);
+               }
+               bio->bi_io_vec[i].bv_page = super->s_erase_page;
+               bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+               bio->bi_io_vec[i].bv_offset = 0;
+       }
+       bio->bi_vcnt = nr_pages;
+       bio->bi_idx = 0;
+       bio->bi_size = nr_pages * PAGE_SIZE;
+       bio->bi_bdev = super->s_bdev;
+       bio->bi_sector = ofs >> 9;
+       bio->bi_private = sb;
+       bio->bi_end_io = erase_end_io;
+       atomic_inc(&super->s_pending_writes);
+       submit_bio(WRITE, bio);
+       return 0;
+}
+
+static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
+               int ensure_write)
 {
        struct logfs_super *super = logfs_super(sb);
-       struct address_space *mapping = super->s_mapping_inode->i_mapping;
-       struct page *page;
-       pgoff_t index = to >> PAGE_SHIFT;
-       int i, nr_pages = len >> PAGE_SHIFT;
 
        BUG_ON(to & (PAGE_SIZE - 1));
        BUG_ON(len & (PAGE_SIZE - 1));
 
-       if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+       if (super->s_flags & LOGFS_SB_FLAG_RO)
                return -EROFS;
 
-       for (i = 0; i < nr_pages; i++) {
-               page = find_get_page(mapping, index + i);
-               if (page) {
-                       memset(page_address(page), 0xFF, PAGE_SIZE);
-                       page_cache_release(page);
-               }
+       if (ensure_write) {
+               /*
+                * Object store doesn't care whether erases happen or not.
+                * But for the journal they are required.  Otherwise a scan
+                * can find an old commit entry and assume it is the current
+                * one, travelling back in time.
+                */
+               do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
        }
+
        return 0;
 }
 
index 68e99d0..cafb6ef 100644 (file)
@@ -83,7 +83,8 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
        return 0;
 }
 
-static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len)
+static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
+               int ensure_write)
 {
        struct mtd_info *mtd = logfs_super(sb)->s_mtd;
        struct erase_info ei;
index 2f2e8e4..c0e7d63 100644 (file)
@@ -392,7 +392,7 @@ static int journal_erase_segment(struct logfs_area *area)
        u64 ofs;
        int err;
 
-       err = logfs_erase_segment(sb, area->a_segno);
+       err = logfs_erase_segment(sb, area->a_segno, 1);
        if (err)
                return err;
 
index e3082ab..7259211 100644 (file)
@@ -151,7 +151,8 @@ struct logfs_device_ops {
        int (*write_sb)(struct super_block *sb, struct page *page);
        int (*readpage)(void *_sb, struct page *page);
        void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
-       int (*erase)(struct super_block *sb, loff_t ofs, size_t len);
+       int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
+                       int ensure_write);
        void (*sync)(struct super_block *sb);
        void (*put_device)(struct super_block *sb);
 };
@@ -327,6 +328,7 @@ struct logfs_super {
        u64      s_feature_compat;
        u64      s_feature_flags;
        u64      s_sb_ofs[2];
+       struct page *s_erase_page;              /* for dev_bdev.c */
        /* alias.c fields */
        struct btree_head32 s_segment_alias;    /* remapped segments */
        int      s_no_object_aliases;
@@ -572,7 +574,7 @@ int get_page_reserve(struct inode *inode, struct page *page);
 extern struct logfs_block_ops indirect_block_ops;
 
 /* segment.c */
-int logfs_erase_segment(struct super_block *sb, u32 ofs);
+int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
 int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
 int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
                level_t level);
index 5f58b74..664cd0d 100644 (file)
@@ -25,14 +25,14 @@ static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
        return 0;
 }
 
-int logfs_erase_segment(struct super_block *sb, u32 segno)
+int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
 {
        struct logfs_super *super = logfs_super(sb);
 
        super->s_gec++;
 
        return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
-                       super->s_segsize);
+                       super->s_segsize, ensure_erase);
 }
 
 static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
@@ -798,7 +798,7 @@ static int ostore_erase_segment(struct logfs_area *area)
        u64 ofs;
        int err;
 
-       err = logfs_erase_segment(sb, area->a_segno);
+       err = logfs_erase_segment(sb, area->a_segno, 0);
        if (err)
                return err;
 
index d128a2c..94d80f7 100644 (file)
@@ -317,6 +317,7 @@ static int logfs_make_writeable(struct super_block *sb)
 
 static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 {
+       struct logfs_super *super = logfs_super(sb);
        struct inode *rootdir;
        int err;
 
@@ -329,15 +330,22 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
        if (!sb->s_root)
                goto fail;
 
+       super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
+       if (!super->s_erase_page)
+               goto fail2;
+       memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
+
        /* FIXME: check for read-only mounts */
        err = logfs_make_writeable(sb);
        if (err)
-               goto fail2;
+               goto fail3;
 
        log_super("LogFS: Finished mounting\n");
        simple_set_mnt(mnt, sb);
        return 0;
 
+fail3:
+       __free_page(super->s_erase_page);
 fail2:
        iput(rootdir);
 fail:
@@ -498,6 +506,8 @@ static void logfs_kill_sb(struct super_block *sb)
        logfs_cleanup_journal(sb);
        logfs_cleanup_areas(sb);
        logfs_cleanup_rw(sb);
+       if (super->s_erase_page)
+               __free_page(super->s_erase_page);
        super->s_devops->put_device(sb);
        mempool_destroy(super->s_btree_pool);
        mempool_destroy(super->s_alias_pool);