[PATCH] md: fix bug where new drives added to an md array sometimes don't sync properly
[linux-3.10.git] / drivers / md / md.c
index f6562ee..57fa64f 100644 (file)
 */
 
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/kthread.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
 #include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/suspend.h>
 #include <linux/poll.h>
@@ -112,7 +110,7 @@ static ctl_table raid_table[] = {
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
-               .mode           = 0644,
+               .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
@@ -120,7 +118,7 @@ static ctl_table raid_table[] = {
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
-               .mode           = 0644,
+               .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
@@ -131,7 +129,7 @@ static ctl_table raid_dir_table[] = {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
-               .mode           = 0555,
+               .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
@@ -175,7 +173,7 @@ EXPORT_SYMBOL_GPL(md_new_event);
 /* Alternate version that can be called from interrupts
  * when calling sysfs_notify isn't needed.
  */
-void md_new_event_inintr(mddev_t *mddev)
+static void md_new_event_inintr(mddev_t *mddev)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
@@ -391,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error)
        if (bio->bi_size)
                return 1;
 
-       if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+       if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+               printk("md: super_written gets error=%d, uptodate=%d\n",
+                      error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+               WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
+       }
 
        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
@@ -1064,6 +1066,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        if (rdev->sb_size & bmask)
                rdev-> sb_size = (rdev->sb_size | bmask)+1;
 
+       if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+               rdev->desc_nr = -1;
+       else
+               rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
        if (refdev == 0)
                ret = 1;
        else {
@@ -1173,7 +1180,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
-               rdev->desc_nr = le32_to_cpu(sb->dev_number);
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
@@ -1406,7 +1412,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
 
-       bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+       bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
@@ -1416,7 +1422,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
-               blkdev_put(bdev);
+               blkdev_put_partition(bdev);
                return err;
        }
        rdev->bdev = bdev;
@@ -1430,7 +1436,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
-       blkdev_put(bdev);
+       blkdev_put_partition(bdev);
 }
 
 void md_autodetect_dev(dev_t dev);
@@ -1585,7 +1591,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
        }
 }
 
-void md_update_sb(mddev_t * mddev)
+static void md_update_sb(mddev_t * mddev, int force_change)
 {
        int err;
        struct list_head *tmp;
@@ -1595,18 +1601,36 @@ void md_update_sb(mddev_t * mddev)
 
 repeat:
        spin_lock_irq(&mddev->write_lock);
-       sync_req = mddev->in_sync;
-       mddev->utime = get_seconds();
-       if (mddev->sb_dirty == 3)
+
+       set_bit(MD_CHANGE_PENDING, &mddev->flags);
+       if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+               force_change = 1;
+       if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
                /* just a clean<-> dirty transition, possibly leave spares alone,
                 * though if events isn't the right even/odd, we will have to do
                 * spares after all
                 */
                nospares = 1;
+       if (force_change)
+               nospares = 0;
+       if (mddev->degraded)
+               /* If the array is degraded, then skipping spares is both
+                * dangerous and fairly pointless.
+                * Dangerous because a device that was removed from the array
+                * might have an event_count that still looks up-to-date,
+                * so it can be re-added without a resync.
+                * Pointless because if there are any spares to skip,
+                * then a recovery will happen and soon that array won't
+                * be degraded any more and the spare can go back to sleep then.
+                */
+               nospares = 0;
+
+       sync_req = mddev->in_sync;
+       mddev->utime = get_seconds();
 
        /* If this is just a dirty<->clean transition, and the array is clean
         * and 'events' is odd, we can roll back to the previous clean state */
-       if (mddev->sb_dirty == 3
+       if (nospares
            && (mddev->in_sync && mddev->recovery_cp == MaxSector)
            && (mddev->events & 1))
                mddev->events--;
@@ -1637,7 +1661,6 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
-       mddev->sb_dirty = 2;
        sync_sbs(mddev, nospares);
 
        /*
@@ -1645,7 +1668,7 @@ repeat:
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
-               mddev->sb_dirty = 0;
+               clear_bit(MD_CHANGE_PENDING, &mddev->flags);
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
@@ -1682,20 +1705,20 @@ repeat:
                        break;
        }
        md_super_wait(mddev);
-       /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+       /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
        spin_lock_irq(&mddev->write_lock);
-       if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+       if (mddev->in_sync != sync_req ||
+           test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
                /* have to write it out again */
                spin_unlock_irq(&mddev->write_lock);
                goto repeat;
        }
-       mddev->sb_dirty = 0;
+       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
        spin_unlock_irq(&mddev->write_lock);
        wake_up(&mddev->sb_wait);
 
 }
-EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -1737,6 +1760,10 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
+       if (test_bit(WriteMostly, &rdev->flags)) {
+               len += sprintf(page+len, "%swrite_mostly",sep);
+               sep = ",";
+       }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
@@ -1745,8 +1772,40 @@ state_show(mdk_rdev_t *rdev, char *page)
        return len+sprintf(page+len, "\n");
 }
 
-static struct rdev_sysfs_entry
-rdev_state = __ATTR_RO(state);
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+       /* can write
+        *  faulty  - simulates an error (only when mddev->pers is set)
+        *  remove  - disconnects the device (-EBUSY if raid_disk >= 0)
+        *  writemostly - sets the WriteMostly flag
+        *  -writemostly - clears the WriteMostly flag
+        */
+       int err = -EINVAL;
+       if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+               md_error(rdev->mddev, rdev);
+               err = 0;
+       } else if (cmd_match(buf, "remove")) {
+               if (rdev->raid_disk >= 0)
+                       err = -EBUSY;
+               else {
+                       mddev_t *mddev = rdev->mddev;
+                       kick_rdev_from_array(rdev);
+                       md_update_sb(mddev, 1);
+                       md_new_event(mddev);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "writemostly")) {
+               set_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       } else if (cmd_match(buf, "-writemostly")) {
+               clear_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       }
+       return err ? err : len;
+}
+static struct rdev_sysfs_entry rdev_state =
+__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1777,7 +1836,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        return -EINVAL;
 }
 static struct rdev_sysfs_entry rdev_errors =
-__ATTR(errors, 0644, errors_show, errors_store);
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
 
 static ssize_t
 slot_show(mdk_rdev_t *rdev, char *page)
@@ -1811,7 +1870,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 
 static struct rdev_sysfs_entry rdev_slot =
-__ATTR(slot, 0644, slot_show, slot_store);
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
 
 static ssize_t
 offset_show(mdk_rdev_t *rdev, char *page)
@@ -1833,7 +1892,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_offset =
-__ATTR(offset, 0644, offset_show, offset_store);
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1857,7 +1916,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_size =
-__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
 
 static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
@@ -1888,6 +1947,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
 
        if (!entry->store)
                return -EIO;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
        return entry->store(rdev, page, length);
 }
 
@@ -2094,7 +2155,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
        return len;
 }
 static struct md_sysfs_entry md_safe_delay =
-__ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
 
 static ssize_t
 level_show(mddev_t *mddev, char *page)
@@ -2129,7 +2190,33 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_level =
-__ATTR(level, 0644, level_show, level_store);
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
+
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+       /* just a number, not meaningful for all levels */
+       return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *e;
+       unsigned long n = simple_strtoul(buf, &e, 10);
+       if (mddev->pers)
+               return -EBUSY;
+
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->layout = n;
+       return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
+
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -2159,7 +2246,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
        return rv ? rv : len;
 }
 static struct md_sysfs_entry md_raid_disks =
-__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
 
 static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
@@ -2183,7 +2270,31 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
        return len;
 }
 static struct md_sysfs_entry md_chunk_size =
-__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
+
+static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       /* can only set resync_start (recovery_cp) if array is not yet active */
+       char *e;
+       unsigned long long n = simple_strtoull(buf, &e, 10);
+
+       if (mddev->pers)
+               return -EBUSY;
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->recovery_cp = n;
+       return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
 
 /*
  * The array state can be:
@@ -2223,7 +2334,7 @@ __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
  */
 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
                   write_pending, active_idle, bad_word};
-char *array_states[] = {
+static char *array_states[] = {
        "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
        "write-pending", "active-idle", NULL };
 
@@ -2323,7 +2434,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                        spin_lock_irq(&mddev->write_lock);
                        if (atomic_read(&mddev->writes_pending) == 0) {
                                mddev->in_sync = 1;
-                               mddev->sb_dirty = 1;
+                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        }
                        spin_unlock_irq(&mddev->write_lock);
                } else {
@@ -2335,7 +2446,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        case active:
                if (mddev->pers) {
                        restart_array(mddev);
-                       mddev->sb_dirty = 0;
+                       clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        wake_up(&mddev->sb_wait);
                        err = 0;
                } else {
@@ -2353,7 +2464,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        else
                return len;
 }
-static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
+static struct md_sysfs_entry md_array_state =
+__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 
 static ssize_t
 null_show(mddev_t *mddev, char *page)
@@ -2413,7 +2525,37 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_new_device =
-__ATTR(new_dev, 0200, null_show, new_dev_store);
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
+
+static ssize_t
+bitmap_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *end;
+       unsigned long chunk, end_chunk;
+
+       if (!mddev->bitmap)
+               goto out;
+       /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+       while (*buf) {
+               chunk = end_chunk = simple_strtoul(buf, &end, 0);
+               if (buf == end) break;
+               if (*end == '-') { /* range */
+                       buf = end + 1;
+                       end_chunk = simple_strtoul(buf, &end, 0);
+                       if (buf == end) break;
+               }
+               if (*end && !isspace(*end)) break;
+               bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+               buf = end;
+               while (isspace(*buf)) buf++;
+       }
+       bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+       return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 
 static ssize_t
 size_show(mddev_t *mddev, char *page)
@@ -2439,7 +2581,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 
        if (mddev->pers) {
                err = update_size(mddev, size);
-               md_update_sb(mddev);
+               md_update_sb(mddev, 1);
        } else {
                if (mddev->size == 0 ||
                    mddev->size > size)
@@ -2451,7 +2593,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_size =
-__ATTR(component_size, 0644, size_show, size_store);
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 
 /* Metdata version.
@@ -2499,7 +2641,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
 
 static ssize_t
 action_show(mddev_t *mddev, char *page)
@@ -2567,12 +2709,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
                       (unsigned long long) mddev->resync_mismatches);
 }
 
-static struct md_sysfs_entry
-md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 
 
-static struct md_sysfs_entry
-md_mismatches = __ATTR_RO(mismatch_cnt);
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
 sync_min_show(mddev_t *mddev, char *page)
@@ -2631,15 +2772,14 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
        unsigned long resync, dt, db;
-       resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+       resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
        db = resync - (mddev->resync_mark_cnt);
        return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
 }
 
-static struct md_sysfs_entry
-md_sync_speed = __ATTR_RO(sync_speed);
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
@@ -2655,8 +2795,7 @@ sync_completed_show(mddev_t *mddev, char *page)
        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
 }
 
-static struct md_sysfs_entry
-md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
 suspend_lo_show(mddev_t *mddev, char *page)
@@ -2717,9 +2856,11 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
+       &md_layout.attr,
        &md_raid_disks.attr,
        &md_chunk_size.attr,
        &md_size.attr,
+       &md_resync_start.attr,
        &md_metadata.attr,
        &md_new_device.attr,
        &md_safe_delay.attr,
@@ -2736,6 +2877,7 @@ static struct attribute *md_redundancy_attrs[] = {
        &md_sync_completed.attr,
        &md_suspend_lo.attr,
        &md_suspend_hi.attr,
+       &md_bitmap.attr,
        NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -2771,6 +2913,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 
        if (!entry->store)
                return -EIO;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
        rv = mddev_lock(mddev);
        if (!rv) {
                rv = entry->store(mddev, page, length);
@@ -2823,13 +2967,10 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        }
        disk->major = MAJOR(dev);
        disk->first_minor = unit << shift;
-       if (partitioned) {
+       if (partitioned)
                sprintf(disk->disk_name, "md_d%d", unit);
-               sprintf(disk->devfs_name, "md/d%d", unit);
-       } else {
+       else
                sprintf(disk->disk_name, "md%d", unit);
-               sprintf(disk->devfs_name, "md/%d", unit);
-       }
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
@@ -3008,10 +3149,9 @@ static int do_md_run(mddev_t * mddev)
                }
        
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-       md_wakeup_thread(mddev->thread);
        
-       if (mddev->sb_dirty)
-               md_update_sb(mddev);
+       if (mddev->flags)
+               md_update_sb(mddev, 0);
 
        set_capacity(disk, mddev->array_size<<1);
 
@@ -3029,7 +3169,7 @@ static int do_md_run(mddev_t * mddev)
         * start recovery here.  If we leave it to md_check_recovery,
         * it will remove the drives and not do the right thing
         */
-       if (mddev->degraded) {
+       if (mddev->degraded && !mddev->sync_thread) {
                struct list_head *rtmp;
                int spares = 0;
                ITERATE_RDEV(mddev,rdev,rtmp)
@@ -3050,10 +3190,11 @@ static int do_md_run(mddev_t * mddev)
                                       mdname(mddev));
                                /* leave the spares where they are, it shouldn't hurt */
                                mddev->recovery = 0;
-                       } else
-                               md_wakeup_thread(mddev->sync_thread);
+                       }
                }
        }
+       md_wakeup_thread(mddev->thread);
+       md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
        mddev->changed = 1;
        md_new_event(mddev);
@@ -3173,10 +3314,10 @@ static int do_md_stop(mddev_t * mddev, int mode)
                        if (mddev->ro)
                                mddev->ro = 0;
                }
-               if (!mddev->in_sync || mddev->sb_dirty) {
+               if (!mddev->in_sync || mddev->flags) {
                        /* mark array as shutdown cleanly */
                        mddev->in_sync = 1;
-                       md_update_sb(mddev);
+                       md_update_sb(mddev, 1);
                }
                if (mode == 1)
                        set_disk_ro(disk, 1);
@@ -3212,6 +3353,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
                mddev->array_size = 0;
                mddev->size = 0;
                mddev->raid_disks = 0;
+               mddev->recovery_cp = 0;
 
                disk = mddev->gendisk;
                if (disk)
@@ -3271,6 +3413,7 @@ static void autorun_devices(int part)
 
        printk(KERN_INFO "md: autorun ...\n");
        while (!list_empty(&pending_raid_disks)) {
+               int unit;
                dev_t dev;
                LIST_HEAD(candidates);
                rdev0 = list_entry(pending_raid_disks.next,
@@ -3290,16 +3433,19 @@ static void autorun_devices(int part)
                 * mostly sane superblocks. It's time to allocate the
                 * mddev.
                 */
-               if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+               if (part) {
+                       dev = MKDEV(mdp_major,
+                                   rdev0->preferred_minor << MdpMinorShift);
+                       unit = MINOR(dev) >> MdpMinorShift;
+               } else {
+                       dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+                       unit = MINOR(dev);
+               }
+               if (rdev0->preferred_minor != unit) {
                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
                        break;
                }
-               if (part)
-                       dev = MKDEV(mdp_major,
-                                   rdev0->preferred_minor << MdpMinorShift);
-               else
-                       dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
 
                md_probe(dev, NULL, NULL);
                mddev = mddev_find(dev);
@@ -3337,67 +3483,6 @@ static void autorun_devices(int part)
        printk(KERN_INFO "md: ... autorun DONE.\n");
 }
 
-/*
- * import RAID devices based on one partition
- * if possible, the array gets run as well.
- */
-
-static int autostart_array(dev_t startdev)
-{
-       char b[BDEVNAME_SIZE];
-       int err = -EINVAL, i;
-       mdp_super_t *sb = NULL;
-       mdk_rdev_t *start_rdev = NULL, *rdev;
-
-       start_rdev = md_import_device(startdev, 0, 0);
-       if (IS_ERR(start_rdev))
-               return err;
-
-
-       /* NOTE: this can only work for 0.90.0 superblocks */
-       sb = (mdp_super_t*)page_address(start_rdev->sb_page);
-       if (sb->major_version != 0 ||
-           sb->minor_version != 90 ) {
-               printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
-               export_rdev(start_rdev);
-               return err;
-       }
-
-       if (test_bit(Faulty, &start_rdev->flags)) {
-               printk(KERN_WARNING 
-                       "md: can not autostart based on faulty %s!\n",
-                       bdevname(start_rdev->bdev,b));
-               export_rdev(start_rdev);
-               return err;
-       }
-       list_add(&start_rdev->same_set, &pending_raid_disks);
-
-       for (i = 0; i < MD_SB_DISKS; i++) {
-               mdp_disk_t *desc = sb->disks + i;
-               dev_t dev = MKDEV(desc->major, desc->minor);
-
-               if (!dev)
-                       continue;
-               if (dev == startdev)
-                       continue;
-               if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
-                       continue;
-               rdev = md_import_device(dev, 0, 0);
-               if (IS_ERR(rdev))
-                       continue;
-
-               list_add(&rdev->same_set, &pending_raid_disks);
-       }
-
-       /*
-        * possibly return codes
-        */
-       autorun_devices(0);
-       return 0;
-
-}
-
-
 static int get_version(void __user * arg)
 {
        mdu_version_t ver;
@@ -3705,7 +3790,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
                goto busy;
 
        kick_rdev_from_array(rdev);
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
        md_new_event(mddev);
 
        return 0;
@@ -3764,6 +3849,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
        }
        clear_bit(In_sync, &rdev->flags);
        rdev->desc_nr = -1;
+       rdev->saved_raid_disk = -1;
        err = bind_rdev_to_array(rdev, mddev);
        if (err)
                goto abort_export;
@@ -3782,7 +3868,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 
        rdev->raid_disk = -1;
 
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
 
        /*
         * Kick recovery, maybe this spare has to be added to the
@@ -3913,7 +3999,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
        mddev->max_disks     = MD_SB_DISKS;
 
-       mddev->sb_dirty      = 1;
+       mddev->flags         = 0;
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
        mddev->bitmap_offset = 0;
@@ -4082,7 +4169,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                        mddev->bitmap_offset = 0;
                }
        }
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
        return rv;
 }
 
@@ -4156,27 +4243,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
                goto abort;
        }
 
-
-       if (cmd == START_ARRAY) {
-               /* START_ARRAY doesn't need to lock the array as autostart_array
-                * does the locking, and it could even be a different array
-                */
-               static int cnt = 3;
-               if (cnt > 0 ) {
-                       printk(KERN_WARNING
-                              "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
-                              "This will not be supported beyond July 2006\n",
-                              current->comm, current->pid);
-                       cnt--;
-               }
-               err = autostart_array(new_decode_dev(arg));
-               if (err) {
-                       printk(KERN_WARNING "md: autostart failed!\n");
-                       goto abort;
-               }
-               goto done;
-       }
-
        err = mddev_lock(mddev);
        if (err) {
                printk(KERN_INFO 
@@ -4373,8 +4439,7 @@ static int md_release(struct inode *inode, struct file * file)
 {
        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
 
-       if (!mddev)
-               BUG();
+       BUG_ON(!mddev);
        mddev_put(mddev);
 
        return 0;
@@ -4502,6 +4567,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
                __builtin_return_address(0),__builtin_return_address(1),
                __builtin_return_address(2),__builtin_return_address(3));
 */
+       if (!mddev->pers)
+               return;
        if (!mddev->pers->error_handler)
                return;
        mddev->pers->error_handler(mddev,rdev);
@@ -4582,9 +4649,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
                    "reshape" :
-                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
-                      "resync" : "recovery")),
-                     per_milli/10, per_milli % 10,
+                   (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
+                    "check" :
+                    (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+                     "resync" : "recovery"))),
+                  per_milli/10, per_milli % 10,
                   (unsigned long long) resync,
                   (unsigned long long) max_blocks);
 
@@ -4599,12 +4668,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
         */
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
-       db = resync - (mddev->resync_mark_cnt/2);
-       rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
+       db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
+               - mddev->resync_mark_cnt;
+       rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
 
        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
 
-       seq_printf(seq, " speed=%ldK/sec", db/dt);
+       seq_printf(seq, " speed=%ldK/sec", db/2/dt);
 }
 
 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -4936,12 +5006,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
-                       mddev->sb_dirty = 3;
+                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        md_wakeup_thread(mddev->thread);
                }
                spin_unlock_irq(&mddev->write_lock);
        }
-       wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+       wait_event(mddev->sb_wait, mddev->flags==0);
 }
 
 void md_write_end(mddev_t *mddev)
@@ -4972,6 +5042,7 @@ void md_do_sync(mddev_t *mddev)
        int skipped = 0;
        struct list_head *rtmp;
        mdk_rdev_t *rdev;
+       char *desc;
 
        /* just incase thread restarts... */
        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -4979,6 +5050,18 @@ void md_do_sync(mddev_t *mddev)
        if (mddev->ro) /* never try to sync a read-only array */
                return;
 
+       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+               if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+                       desc = "data-check";
+               else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+                       desc = "requested-resync";
+               else
+                       desc = "resync";
+       } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+               desc = "reshape";
+       else
+               desc = "recovery";
+
        /* we overload curr_resync somewhat here.
         * 0 == not engaged in resync at all
         * 2 == checking that there is no conflict with another sync
@@ -5022,10 +5105,10 @@ void md_do_sync(mddev_t *mddev)
                                prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
                                if (!kthread_should_stop() &&
                                    mddev2->curr_resync >= mddev->curr_resync) {
-                                       printk(KERN_INFO "md: delaying resync of %s"
-                                              " until %s has finished resync (they"
+                                       printk(KERN_INFO "md: delaying %s of %s"
+                                              " until %s has finished (they"
                                               " share one or more physical units)\n",
-                                              mdname(mddev), mdname(mddev2));
+                                              desc, mdname(mddev), mdname(mddev2));
                                        mddev_put(mddev2);
                                        schedule();
                                        finish_wait(&resync_wait, &wq);
@@ -5061,12 +5144,12 @@ void md_do_sync(mddev_t *mddev)
                                j = rdev->recovery_offset;
        }
 
-       printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
-       printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-               " %d KB/sec/disc.\n", speed_min(mddev));
+       printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+       printk(KERN_INFO "md: minimum _guaranteed_  speed:"
+               " %d KB/sec/disk.\n", speed_min(mddev));
        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
-              "(but not more than %d KB/sec) for reconstruction.\n",
-              speed_max(mddev));
+              "(but not more than %d KB/sec) for %s.\n",
+              speed_max(mddev), desc);
 
        is_mddev_idle(mddev); /* this also initializes IO event counters */
 
@@ -5092,8 +5175,8 @@ void md_do_sync(mddev_t *mddev)
 
        if (j>2) {
                printk(KERN_INFO 
-                       "md: resuming recovery of %s from checkpoint.\n",
-                       mdname(mddev));
+                      "md: resuming %s of %s from checkpoint.\n",
+                      desc, mdname(mddev));
                mddev->curr_resync = j;
        }
 
@@ -5115,6 +5198,7 @@ void md_do_sync(mddev_t *mddev)
 
                j += sectors;
                if (j>1) mddev->curr_resync = j;
+               mddev->curr_mark_cnt = io_sectors;
                if (last_check == 0)
                        /* this is the earliers that rebuilt will be
                         * visible in /proc/mdstat
@@ -5175,7 +5259,7 @@ void md_do_sync(mddev_t *mddev)
                        }
                }
        }
-       printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+       printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
        /*
         * this also signals 'finished resyncing' to md_stop
         */
@@ -5195,8 +5279,8 @@ void md_do_sync(mddev_t *mddev)
                        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                                if (mddev->curr_resync >= mddev->recovery_cp) {
                                        printk(KERN_INFO
-                                              "md: checkpointing recovery of %s.\n",
-                                              mdname(mddev));
+                                              "md: checkpointing %s of %s.\n",
+                                              desc, mdname(mddev));
                                        mddev->recovery_cp = mddev->curr_resync;
                                }
                        } else
@@ -5210,7 +5294,6 @@ void md_do_sync(mddev_t *mddev)
                                    !test_bit(In_sync, &rdev->flags) &&
                                    rdev->recovery_offset < mddev->curr_resync)
                                        rdev->recovery_offset = mddev->curr_resync;
-                       mddev->sb_dirty = 1;
                }
        }
 
@@ -5267,7 +5350,7 @@ void md_check_recovery(mddev_t *mddev)
        }
 
        if ( ! (
-               mddev->sb_dirty ||
+               mddev->flags ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
                (mddev->safemode == 1) ||
@@ -5283,14 +5366,14 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       mddev->sb_dirty = 3;
+                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
                spin_unlock_irq(&mddev->write_lock);
 
-               if (mddev->sb_dirty)
-                       md_update_sb(mddev);
+               if (mddev->flags)
+                       md_update_sb(mddev, 0);
 
 
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -5309,7 +5392,7 @@ void md_check_recovery(mddev_t *mddev)
                                /* activate any spares */
                                mddev->pers->spare_active(mddev);
                        }
-                       md_update_sb(mddev);
+                       md_update_sb(mddev, 1);
 
                        /* if array is no-longer degraded, then any saved_raid_disk
                         * information must be scrapped
@@ -5449,37 +5532,16 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-       int minor;
-
-       printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
-                       " MD_SB_DISKS=%d\n",
-                       MD_MAJOR_VERSION, MD_MINOR_VERSION,
-                       MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
-       printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
-                       BITMAP_MINOR);
-
        if (register_blkdev(MAJOR_NR, "md"))
                return -1;
        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
                unregister_blkdev(MAJOR_NR, "md");
                return -1;
        }
-       devfs_mk_dir("md");
-       blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
-                               md_probe, NULL, NULL);
-       blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+       blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+                           md_probe, NULL, NULL);
+       blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
-
-       for (minor=0; minor < MAX_MD_DEVS; ++minor)
-               devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
-                               S_IFBLK|S_IRUSR|S_IWUSR,
-                               "md/%d", minor);
-
-       for (minor=0; minor < MAX_MD_DEVS; ++minor)
-               devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
-                             S_IFBLK|S_IRUSR|S_IWUSR,
-                             "md/mdp%d", minor);
-
 
        register_reboot_notifier(&md_notifier);
        raid_table_header = register_sysctl_table(raid_root_table, 1);
@@ -5536,15 +5598,9 @@ static __exit void md_exit(void)
 {
        mddev_t *mddev;
        struct list_head *tmp;
-       int i;
-       blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
-       blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
-       for (i=0; i < MAX_MD_DEVS; i++)
-               devfs_remove("md/%d", i);
-       for (i=0; i < MAX_MD_DEVS; i++)
-               devfs_remove("md/d%d", i);
 
-       devfs_remove("md");
+       blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+       blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
        unregister_blkdev(MAJOR_NR,"md");
        unregister_blkdev(mdp_major, "mdp");
@@ -5581,8 +5637,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
        return -EINVAL;
 }
 
-module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
-module_param(start_dirty_degraded, int, 0644);
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
 
 
 EXPORT_SYMBOL(register_md_personality);