[PATCH] md: Checkpoint and allow restart of raid5 reshape
NeilBrown [Mon, 27 Mar 2006 09:18:11 +0000 (01:18 -0800)]
We allow the superblock to record an 'old' and a 'new' geometry, and a
position where any conversion is up to.  The geometry allows for changing
chunksize, layout and level as well as number of devices.

When using verion-0.90 superblock, we convert the version to 0.91 while the
conversion is happening so that an old kernel will refuse the assemble the
array.  For version-1, we use a feature bit for the same effect.

When starting an array we check for an incomplete reshape and restart the
reshape process if needed.  If the reshape stopped at an awkward time (like
when updating the first stripe) we refuse to assemble the array, and let
user-space worry about it.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid5.c
include/linux/raid/md.h
include/linux/raid/md_k.h
include/linux/raid/md_p.h
include/linux/raid/raid5.h

index d169bc9..b9dfdfc 100644 (file)
@@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
        }
 
        if (sb->major_version != 0 ||
-           sb->minor_version != 90) {
+           sb->minor_version < 90 ||
+           sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
@@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
+               if (mddev->minor_version >= 91) {
+                       mddev->reshape_position = sb->reshape_position;
+                       mddev->delta_disks = sb->delta_disks;
+                       mddev->new_level = sb->new_level;
+                       mddev->new_layout = sb->new_layout;
+                       mddev->new_chunk = sb->new_chunk;
+               } else {
+                       mddev->reshape_position = MaxSector;
+                       mddev->delta_disks = 0;
+                       mddev->new_level = mddev->level;
+                       mddev->new_layout = mddev->layout;
+                       mddev->new_chunk = mddev->chunk_size;
+               }
+
                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
@@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
-       sb->minor_version = mddev->minor_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;
 
+       if (mddev->reshape_position == MaxSector)
+               sb->minor_version = 90;
+       else {
+               sb->minor_version = 91;
+               sb->reshape_position = mddev->reshape_position;
+               sb->new_level = mddev->new_level;
+               sb->delta_disks = mddev->delta_disks;
+               sb->new_layout = mddev->new_layout;
+               sb->new_chunk = mddev->new_chunk;
+       }
+       mddev->minor_version = sb->minor_version;
        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
@@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        }
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
                }
+               if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+                       mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+                       mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+                       mddev->new_level = le32_to_cpu(sb->new_level);
+                       mddev->new_layout = le32_to_cpu(sb->new_layout);
+                       mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+               } else {
+                       mddev->reshape_position = MaxSector;
+                       mddev->delta_disks = 0;
+                       mddev->new_level = mddev->level;
+                       mddev->new_layout = mddev->layout;
+                       mddev->new_chunk = mddev->chunk_size;
+               }
+
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling */
                __u64 ev1 = le64_to_cpu(sb->events);
@@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }
+       if (mddev->reshape_position != MaxSector) {
+               sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+               sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+               sb->new_layout = cpu_to_le32(mddev->new_layout);
+               sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+               sb->new_level = cpu_to_le32(mddev->new_level);
+               sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+       }
 
        max_dev = 0;
        ITERATE_RDEV(mddev,rdev2,tmp)
@@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev)
        }
 }
 
-static void md_update_sb(mddev_t * mddev)
+void md_update_sb(mddev_t * mddev)
 {
        int err;
        struct list_head *tmp;
@@ -1574,6 +1621,7 @@ repeat:
        wake_up(&mddev->sb_wait);
 
 }
+EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev)
        mddev->level = pers->level;
        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+       if (mddev->reshape_position != MaxSector &&
+           pers->reshape == NULL) {
+               /* This personality cannot handle reshaping... */
+               mddev->pers = NULL;
+               module_put(pers->owner);
+               return -EINVAL;
+       }
+
        mddev->recovery = 0;
        mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
        mddev->barriers_work = 1;
@@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
        mddev->bitmap_offset = 0;
 
+       mddev->reshape_position = MaxSector;
+
        /*
         * Generate a 128 bit UUID
         */
        get_random_bytes(mddev->uuid, 16);
 
+       mddev->new_level = mddev->level;
+       mddev->new_chunk = mddev->chunk_size;
+       mddev->new_layout = mddev->layout;
+       mddev->delta_disks = 0;
+
        return 0;
 }
 
index 5d88329..b65b8cf 100644 (file)
@@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev)
                       mdname(mddev), mddev->level);
                goto out;
        }
+       if (mddev->reshape_position != MaxSector) {
+               printk("raid1: %s: reshape_position set but not supported\n",
+                      mdname(mddev));
+               goto out;
+       }
        /*
         * copy the already verified devices into our private RAID1
         * bookkeeping area. [whatever we allocate in run(),
index b29135a..20ae32d 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/raid/raid5.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/kthread.h>
 #include <asm/atomic.h>
 
 #include <linux/raid/bitmap.h>
@@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh)
                clear_bit(STRIPE_EXPANDING, &sh->state);
        } else if (expanded) {
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
+               atomic_dec(&conf->reshape_stripes);
                wake_up(&conf->wait_for_overlap);
                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
        }
@@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                 */
                int i;
                int dd_idx;
+
+               if (sector_nr == 0 &&
+                   conf->expand_progress != 0) {
+                       /* restarting in the middle, skip the initial sectors */
+                       sector_nr = conf->expand_progress;
+                       sector_div(sector_nr, conf->raid_disks-1);
+                       *skipped = 1;
+                       return sector_nr;
+               }
+
+               /* Cannot proceed until we've updated the superblock... */
+               wait_event(conf->wait_for_overlap,
+                          atomic_read(&conf->reshape_stripes)==0);
+               mddev->reshape_position = conf->expand_progress;
+
+               mddev->sb_dirty = 1;
+               md_wakeup_thread(mddev->thread);
+               wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+                       kthread_should_stop());
+
                for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
                        int j;
                        int skipped = 0;
@@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                        sh = get_active_stripe(conf, sector_nr+i,
                                               conf->raid_disks, pd_idx, 0);
                        set_bit(STRIPE_EXPANDING, &sh->state);
+                       atomic_inc(&conf->reshape_stripes);
                        /* If any of this stripe is beyond the end of the old
                         * array, then we need to zero those blocks
                         */
@@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev)
                return -EIO;
        }
 
+       if (mddev->reshape_position != MaxSector) {
+               /* Check that we can continue the reshape.
+                * Currently only disks can change, it must
+                * increase, and we must be past the point where
+                * a stripe over-writes itself
+                */
+               sector_t here_new, here_old;
+               int old_disks;
+
+               if (mddev->new_level != mddev->level ||
+                   mddev->new_layout != mddev->layout ||
+                   mddev->new_chunk != mddev->chunk_size) {
+                       printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+                              mdname(mddev));
+                       return -EINVAL;
+               }
+               if (mddev->delta_disks <= 0) {
+                       printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+                              mdname(mddev));
+                       return -EINVAL;
+               }
+               old_disks = mddev->raid_disks - mddev->delta_disks;
+               /* reshape_position must be on a new-stripe boundary, and one
+                * further up in new geometry must map after here in old geometry.
+                */
+               here_new = mddev->reshape_position;
+               if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
+                       printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+                       return -EINVAL;
+               }
+               /* here_new is the stripe we will write to */
+               here_old = mddev->reshape_position;
+               sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
+               /* here_old is the first stripe that we might need to read from */
+               if (here_new >= here_old) {
+                       /* Reading from the same stripe as writing to - bad */
+                       printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+                       return -EINVAL;
+               }
+               printk(KERN_INFO "raid5: reshape will continue\n");
+               /* OK, we should be able to continue; */
+       }
+
+
        mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
        if ((conf = mddev->private) == NULL)
                goto abort;
-       conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+       if (mddev->reshape_position == MaxSector) {
+               conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
+       } else {
+               conf->raid_disks = mddev->raid_disks;
+               conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+       }
+
+       conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
                              GFP_KERNEL);
        if (!conf->disks)
                goto abort;
@@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev)
 
        ITERATE_RDEV(mddev,rdev,tmp) {
                raid_disk = rdev->raid_disk;
-               if (raid_disk >= mddev->raid_disks
+               if (raid_disk >= conf->raid_disks
                    || raid_disk < 0)
                        continue;
                disk = conf->disks + raid_disk;
@@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev)
                }
        }
 
-       conf->raid_disks = mddev->raid_disks;
        /*
         * 0 for a fully functional array, 1 for a degraded array.
         */
@@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev)
        conf->level = mddev->level;
        conf->algorithm = mddev->layout;
        conf->max_nr_stripes = NR_STRIPES;
-       conf->expand_progress = MaxSector;
+       conf->expand_progress = mddev->reshape_position;
 
        /* device size must be a multiple of chunk size */
        mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev)
 
        print_raid5_conf(conf);
 
+       if (conf->expand_progress != MaxSector) {
+               printk("...ok start reshape thread\n");
+               atomic_set(&conf->reshape_stripes, 0);
+               clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+               clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+               set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+               set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+               mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+                                                       "%s_reshape");
+               /* FIXME if md_register_thread fails?? */
+               md_wakeup_thread(mddev->sync_thread);
+
+       }
+
        /* read-ahead size must cover two whole stripes, which is
         * 2 * (n-1) * chunksize where 'n' is the number of raid devices
         */
@@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev)
 
        mddev->queue->unplug_fn = raid5_unplug_device;
        mddev->queue->issue_flush_fn = raid5_issue_flush;
+       mddev->array_size =  mddev->size * (conf->previous_raid_disks - 1);
 
-       mddev->array_size =  mddev->size * (mddev->raid_disks - 1);
        return 0;
 abort:
        if (conf) {
@@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
        /*
         * find the disk ...
         */
-       for (disk=0; disk < mddev->raid_disks; disk++)
+       for (disk=0; disk < conf->raid_disks; disk++)
                if ((p=conf->disks + disk)->rdev == NULL) {
                        clear_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = disk;
@@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
        if (err)
                return err;
 
+       atomic_set(&conf->reshape_stripes, 0);
        spin_lock_irq(&conf->device_lock);
        conf->previous_raid_disks = conf->raid_disks;
-       mddev->raid_disks = conf->raid_disks = raid_disks;
+       conf->raid_disks = raid_disks;
        conf->expand_progress = 0;
        spin_unlock_irq(&conf->device_lock);
 
@@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
                }
 
        mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
+       mddev->new_chunk = mddev->chunk_size;
+       mddev->new_layout = mddev->layout;
+       mddev->new_level = mddev->level;
+       mddev->raid_disks = raid_disks;
+       mddev->delta_disks = raid_disks - conf->previous_raid_disks;
+       mddev->reshape_position = 0;
+       mddev->sb_dirty = 1;
+
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
                mddev->recovery = 0;
                spin_lock_irq(&conf->device_lock);
                mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+               mddev->delta_disks = 0;
                conf->expand_progress = MaxSector;
                spin_unlock_irq(&conf->device_lock);
                return -EAGAIN;
@@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf)
 {
        struct block_device *bdev;
 
-       conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
-       set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
-       conf->mddev->changed = 1;
-
-       bdev = bdget_disk(conf->mddev->gendisk, 0);
-       if (bdev) {
-               mutex_lock(&bdev->bd_inode->i_mutex);
-               i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
-               mutex_unlock(&bdev->bd_inode->i_mutex);
-               bdput(bdev);
+       if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+               conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+               set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+               conf->mddev->changed = 1;
+
+               bdev = bdget_disk(conf->mddev->gendisk, 0);
+               if (bdev) {
+                       mutex_lock(&bdev->bd_inode->i_mutex);
+                       i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+                       mutex_unlock(&bdev->bd_inode->i_mutex);
+                       bdput(bdev);
+               }
+               spin_lock_irq(&conf->device_lock);
+               conf->expand_progress = MaxSector;
+               spin_unlock_irq(&conf->device_lock);
+               conf->mddev->reshape_position = MaxSector;
        }
-       spin_lock_irq(&conf->device_lock);
-       conf->expand_progress = MaxSector;
-       spin_unlock_irq(&conf->device_lock);
 }
 
 static void raid5_quiesce(mddev_t *mddev, int state)
index 9c77cde..66b44e5 100644 (file)
@@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
 
+extern void md_update_sb(mddev_t * mddev);
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 #endif 
index 4e26ef2..1a6f9f2 100644 (file)
@@ -132,6 +132,14 @@ struct mddev_s
 
        char                            uuid[16];
 
+       /* If the array is being reshaped, we need to record the
+        * new shape and an indication of where we are up to.
+        * This is written to the superblock.
+        * If reshape_position is MaxSector, then no reshape is happening (yet).
+        */
+       sector_t                        reshape_position;
+       int                             delta_disks, new_level, new_layout, new_chunk;
+
        struct mdk_thread_s             *thread;        /* management thread */
        struct mdk_thread_s             *sync_thread;   /* doing resync or reconstruct */
        sector_t                        curr_resync;    /* blocks scheduled */
index c100fa5..774e1ac 100644 (file)
@@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_ERRORS           1
 
 #define        MD_SB_BITMAP_PRESENT    8 /* bitmap may be present nearby */
+
+/*
+ * Notes:
+ * - if an array is being reshaped (restriped) in order to change the
+ *   the number of active devices in the array, 'raid_disks' will be
+ *   the larger of the old and new numbers.  'delta_disks' will
+ *   be the "new - old".  So if +ve, raid_disks is the new value, and
+ *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the
+ *   old value and "raid_disks+delta_disks" is the new (smaller) value.
+ */
+
+
 typedef struct mdp_superblock_s {
        /*
         * Constant generic information
@@ -146,7 +158,13 @@ typedef struct mdp_superblock_s {
        __u32 cp_events_hi;     /* 10 high-order of checkpoint update count   */
 #endif
        __u32 recovery_cp;      /* 11 recovery checkpoint sector count        */
-       __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12];
+       /* There are only valid for minor_version > 90 */
+       __u64 reshape_position; /* 12,13 next address in array-space for reshape */
+       __u32 new_level;        /* 14 new level we are reshaping to           */
+       __u32 delta_disks;      /* 15 change in number of raid_disks          */
+       __u32 new_layout;       /* 16 new layout                              */
+       __u32 new_chunk;        /* 17 new chunk size (bytes)                  */
+       __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
 
        /*
         * Personality information
@@ -207,7 +225,14 @@ struct mdp_superblock_1 {
                                 * NOTE: signed, so bitmap can be before superblock
                                 * only meaningful of feature_map[0] is set.
                                 */
-       __u8    pad1[128-100];  /* set to 0 when written */
+
+       /* These are only valid with feature bit '4' */
+       __u64   reshape_position;       /* next address in array-space for reshape */
+       __u32   new_level;      /* new level we are reshaping to                */
+       __u32   delta_disks;    /* change in number of raid_disks               */
+       __u32   new_layout;     /* new layout                                   */
+       __u32   new_chunk;      /* new chunk size (bytes)                       */
+       __u8    pad1[128-124];  /* set to 0 when written */
 
        /* constant this-device information - 64 bytes */
        __u64   data_offset;    /* sector start of data, often 0 */
@@ -240,8 +265,9 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET       1
+#define        MD_FEATURE_RESHAPE_ACTIVE       4
 
-#define        MD_FEATURE_ALL                  1
+#define        MD_FEATURE_ALL                  5
 
 #endif 
 
index 55c738d..abcdf0d 100644 (file)
@@ -224,6 +224,7 @@ struct raid5_private_data {
        struct list_head        bitmap_list; /* stripes delaying awaiting bitmap update */
        atomic_t                preread_active_stripes; /* stripes with scheduled io */
 
+       atomic_t                reshape_stripes; /* stripes with pending writes for reshape */
        /* unfortunately we need two cache names as we temporarily have
         * two caches.
         */