md: Support write-intent bitmaps with externally managed metadata.
NeilBrown [Mon, 14 Dec 2009 01:49:56 +0000 (12:49 +1100)]
In this case, the metadata must not be in the same
sector as the bitmap.
md will not read or write any bitmap metadata.  Configuration
must be done via sysfs.  When a recovery makes the array
non-degraded again, writing 'true' to 'bitmap/can_clear' will
allow bits in the bitmap to be cleared again.

Signed-off-by: NeilBrown <neilb@suse.de>

Documentation/md.txt
drivers/md/bitmap.c
drivers/md/bitmap.h
drivers/md/md.h

index 18fad68..21d26fb 100644 (file)
@@ -322,6 +322,22 @@ All md devices contain:
      'backlog' sets a limit on the number of concurrent background
      writes.  If there are more than this, new writes will by
      synchronous.
+  bitmap/metadata
+     This can be either 'internal' or 'external'.
+     'internal' is the default and means the metadata for the bitmap
+     is stored in the first 256 bytes of the allocated space and is
+     managed by the md module.
+     'external' means that bitmap metadata is managed externally to
+     the kernel (i.e. by some userspace program).
+  bitmap/can_clear
+     This is either 'true' or 'false'.  If 'true', then bits in the
+     bitmap will be cleared when the corresponding blocks are thought
+     to be in-sync.  If 'false', bits will never be cleared.
+     This is automatically set to 'false' if a write happens on a
+     degraded array, or if the array becomes degraded during a write.
+     When metadata is managed externally, it should be set to true
+     once the array becomes non-degraded, and this fact has been
+     recorded in the metadata.
      
      
      
index 6295849..de5c42d 100644 (file)
@@ -497,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
 
        if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
                return;
+       if (bitmap->mddev->bitmap_info.external)
+               return;
        spin_lock_irqsave(&bitmap->lock, flags);
        if (!bitmap->sb_page) { /* no superblock */
                spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -676,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
  * general bitmap file operations
  */
 
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
 /* calculate the index of the page that contains this bit */
-static inline unsigned long file_page_index(unsigned long chunk)
+static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
 {
-       return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
+       if (!bitmap->mddev->bitmap_info.external)
+               chunk += sizeof(bitmap_super_t) << 3;
+       return chunk >> PAGE_BIT_SHIFT;
 }
 
 /* calculate the (bit) offset of this bit within a page */
-static inline unsigned long file_page_offset(unsigned long chunk)
+static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
 {
-       return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
+       if (!bitmap->mddev->bitmap_info.external)
+               chunk += sizeof(bitmap_super_t) << 3;
+       return chunk & (PAGE_BITS - 1);
 }
 
 /*
@@ -698,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk)
 static inline struct page *filemap_get_page(struct bitmap *bitmap,
                                        unsigned long chunk)
 {
-       if (file_page_index(chunk) >= bitmap->file_pages) return NULL;
-       return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
+       if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL;
+       return bitmap->filemap[file_page_index(bitmap, chunk)
+                              - file_page_index(bitmap, 0)];
 }
 
 
@@ -722,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
        spin_unlock_irqrestore(&bitmap->lock, flags);
 
        while (pages--)
-               if (map[pages]->index != 0) /* 0 is sb_page, release it below */
+               if (map[pages] != sb_page) /* 0 is sb_page, release it below */
                        free_buffers(map[pages]);
        kfree(map);
        kfree(attr);
@@ -833,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
        page = filemap_get_page(bitmap, chunk);
        if (!page) return;
-       bit = file_page_offset(chunk);
+       bit = file_page_offset(bitmap, chunk);
 
        /* set the bit */
        kaddr = kmap_atomic(page, KM_USER0);
@@ -931,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
                        "recovery\n", bmname(bitmap));
 
        bytes = (chunks + 7) / 8;
+       if (!bitmap->mddev->bitmap_info.external)
+               bytes += sizeof(bitmap_super_t);
 
-       num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
+       
+       num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 
-       if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
+       if (file && i_size_read(file->f_mapping->host) < bytes) {
                printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
                        bmname(bitmap),
                        (unsigned long) i_size_read(file->f_mapping->host),
-                       bytes + sizeof(bitmap_super_t));
+                       bytes);
                goto err;
        }
 
@@ -959,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
        for (i = 0; i < chunks; i++) {
                int b;
-               index = file_page_index(i);
-               bit = file_page_offset(i);
+               index = file_page_index(bitmap, i);
+               bit = file_page_offset(bitmap, i);
                if (index != oldindex) { /* this is a new page, read it in */
                        int count;
                        /* unmap the old page, we're done with it */
                        if (index == num_pages-1)
-                               count = bytes + sizeof(bitmap_super_t)
-                                       - index * PAGE_SIZE;
+                               count = bytes - index * PAGE_SIZE;
                        else
                                count = PAGE_SIZE;
-                       if (index == 0) {
+                       if (index == 0 && bitmap->sb_page) {
                                /*
                                 * if we're here then the superblock page
                                 * contains some bits (PAGE_SIZE != sizeof sb)
@@ -1164,7 +1179,8 @@ void bitmap_daemon_work(mddev_t *mddev)
                        /* We are possibly going to clear some bits, so make
                         * sure that events_cleared is up-to-date.
                         */
-                       if (bitmap->need_sync) {
+                       if (bitmap->need_sync &&
+                           bitmap->mddev->bitmap_info.external == 0) {
                                bitmap_super_t *sb;
                                bitmap->need_sync = 0;
                                sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1174,7 +1190,8 @@ void bitmap_daemon_work(mddev_t *mddev)
                                write_page(bitmap, bitmap->sb_page, 1);
                        }
                        spin_lock_irqsave(&bitmap->lock, flags);
-                       clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+                       if (!bitmap->need_sync)
+                               clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
                }
                bmc = bitmap_get_counter(bitmap,
                                         (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
@@ -1189,7 +1206,7 @@ void bitmap_daemon_work(mddev_t *mddev)
                        if (*bmc == 2) {
                                *bmc=1; /* maybe clear the bit next time */
                                set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
-                       } else if (*bmc == 1) {
+                       } else if (*bmc == 1 && !bitmap->need_sync) {
                                /* we can clear the bit */
                                *bmc = 0;
                                bitmap_count_page(bitmap,
@@ -1199,9 +1216,11 @@ void bitmap_daemon_work(mddev_t *mddev)
                                /* clear the bit */
                                paddr = kmap_atomic(page, KM_USER0);
                                if (bitmap->flags & BITMAP_HOSTENDIAN)
-                                       clear_bit(file_page_offset(j), paddr);
+                                       clear_bit(file_page_offset(bitmap, j),
+                                                 paddr);
                                else
-                                       ext2_clear_bit(file_page_offset(j), paddr);
+                                       ext2_clear_bit(file_page_offset(bitmap, j),
+                                                      paddr);
                                kunmap_atomic(paddr, KM_USER0);
                        }
                } else
@@ -1356,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
                    bitmap->events_cleared < bitmap->mddev->events) {
                        bitmap->events_cleared = bitmap->mddev->events;
                        bitmap->need_sync = 1;
+                       sysfs_notify_dirent(bitmap->sysfs_can_clear);
                }
 
                if (!success && ! (*bmc & NEEDED_MASK))
@@ -1613,6 +1633,9 @@ void bitmap_destroy(mddev_t *mddev)
        if (mddev->thread)
                mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 
+       if (bitmap->sysfs_can_clear)
+               sysfs_put(bitmap->sysfs_can_clear);
+
        bitmap_free(bitmap);
 }
 
@@ -1629,6 +1652,7 @@ int bitmap_create(mddev_t *mddev)
        struct file *file = mddev->bitmap_info.file;
        int err;
        sector_t start;
+       struct sysfs_dirent *bm;
 
        BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
@@ -1648,6 +1672,13 @@ int bitmap_create(mddev_t *mddev)
 
        bitmap->mddev = mddev;
 
+       bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
+       if (bm) {
+               bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
+               sysfs_put(bm);
+       } else
+               bitmap->sysfs_can_clear = NULL;
+
        bitmap->file = file;
        if (file) {
                get_file(file);
@@ -1658,7 +1689,16 @@ int bitmap_create(mddev_t *mddev)
                vfs_fsync(file, file->f_dentry, 1);
        }
        /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
-       err = bitmap_read_sb(bitmap);
+       if (!mddev->bitmap_info.external)
+               err = bitmap_read_sb(bitmap);
+       else {
+               err = 0;
+               if (mddev->bitmap_info.chunksize == 0 ||
+                   mddev->bitmap_info.daemon_sleep == 0)
+                       /* chunksize and time_base need to be
+                        * set first. */
+                       err = -EINVAL;
+       }
        if (err)
                goto error;
 
@@ -1777,7 +1817,8 @@ location_store(mddev_t *mddev, const char *buf, size_t len)
                                return rv;
                        if (offset == 0)
                                return -EINVAL;
-                       if (mddev->major_version == 0 &&
+                       if (mddev->bitmap_info.external == 0 &&
+                           mddev->major_version == 0 &&
                            offset != mddev->bitmap_info.default_offset)
                                return -EINVAL;
                        mddev->bitmap_info.offset = offset;
@@ -1906,11 +1947,66 @@ chunksize_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry bitmap_chunksize =
 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
+static ssize_t metadata_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%s\n", (mddev->bitmap_info.external
+                                     ? "external" : "internal"));
+}
+
+static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       if (mddev->bitmap ||
+           mddev->bitmap_info.file ||
+           mddev->bitmap_info.offset)
+               return -EBUSY;
+       if (strncmp(buf, "external", 8) == 0)
+               mddev->bitmap_info.external = 1;
+       else if (strncmp(buf, "internal", 8) == 0)
+               mddev->bitmap_info.external = 0;
+       else
+               return -EINVAL;
+       return len;
+}
+
+static struct md_sysfs_entry bitmap_metadata =
+__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+
+static ssize_t can_clear_show(mddev_t *mddev, char *page)
+{
+       int len;
+       if (mddev->bitmap)
+               len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
+                                            "false" : "true"));
+       else
+               len = sprintf(page, "\n");
+       return len;
+}
+
+static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       if (mddev->bitmap == NULL)
+               return -ENOENT;
+       if (strncmp(buf, "false", 5) == 0)
+               mddev->bitmap->need_sync = 1;
+       else if (strncmp(buf, "true", 4) == 0) {
+               if (mddev->degraded)
+                       return -EBUSY;
+               mddev->bitmap->need_sync = 0;
+       } else
+               return -EINVAL;
+       return len;
+}
+
+static struct md_sysfs_entry bitmap_can_clear =
+__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
+
 static struct attribute *md_bitmap_attrs[] = {
        &bitmap_location.attr,
        &bitmap_timeout.attr,
        &bitmap_backlog.attr,
        &bitmap_chunksize.attr,
+       &bitmap_metadata.attr,
+       &bitmap_can_clear.attr,
        NULL
 };
 struct attribute_group md_bitmap_group = {
index 50ee424..cb821d7 100644 (file)
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
                        (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
 #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
 
-/*
- * on-disk bitmap:
- *
- * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
- * file a page at a time. There's a superblock at the start of the file.
- */
-
-/* map chunks (bits) to file pages - offset by the size of the superblock */
-#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
-
 #endif
 
 /*
@@ -250,6 +240,7 @@ struct bitmap {
        wait_queue_head_t write_wait;
        wait_queue_head_t overflow_wait;
 
+       struct sysfs_dirent *sysfs_can_clear;
 };
 
 /* the bitmap API */
index fce0207..d913888 100644 (file)
@@ -296,6 +296,7 @@ struct mddev_s
                unsigned long           chunksize;
                unsigned long           daemon_sleep; /* how many seconds between updates? */
                unsigned long           max_write_behind; /* write-behind mode */
+               int                     external;
        } bitmap_info;
 
        struct list_head                all_mddevs;