[PATCH] md: fix bug where new drives added to an md array sometimes don't sync properly
[linux-3.10.git] / drivers / md / md.c
index 92fd010..57fa64f 100644 (file)
 */
 
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/kthread.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
 #include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/suspend.h>
 #include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/ctype.h>
 
 #include <linux/init.h>
 
@@ -71,6 +71,10 @@ static void autostart_arrays (int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
 /*
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
@@ -106,7 +110,7 @@ static ctl_table raid_table[] = {
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
-               .mode           = 0644,
+               .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
@@ -114,7 +118,7 @@ static ctl_table raid_table[] = {
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
-               .mode           = 0644,
+               .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
@@ -125,7 +129,7 @@ static ctl_table raid_dir_table[] = {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
-               .mode           = 0555,
+               .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
@@ -162,9 +166,19 @@ void md_new_event(mddev_t *mddev)
 {
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
+       sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
+/* Alternate version that can be called from interrupts
+ * when calling sysfs_notify isn't needed.
+ */
+static void md_new_event_inintr(mddev_t *mddev)
+{
+       atomic_inc(&md_event_count);
+       wake_up(&md_event_waiters);
+}
+
 /*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
@@ -214,13 +228,11 @@ static void mddev_put(mddev_t *mddev)
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
-               /* that blocks */
+               spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
-               /* that also blocks */
                kobject_unregister(&mddev->kobj);
-               /* result blows... */
-       }
-       spin_unlock(&all_mddevs_lock);
+       } else
+               spin_unlock(&all_mddevs_lock);
 }
 
 static mddev_t * mddev_find(dev_t unit)
@@ -254,7 +266,7 @@ static mddev_t * mddev_find(dev_t unit)
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;
 
-       init_MUTEX(&new->reconfig_sem);
+       mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
@@ -276,22 +288,17 @@ static mddev_t * mddev_find(dev_t unit)
 
 static inline int mddev_lock(mddev_t * mddev)
 {
-       return down_interruptible(&mddev->reconfig_sem);
-}
-
-static inline void mddev_lock_uninterruptible(mddev_t * mddev)
-{
-       down(&mddev->reconfig_sem);
+       return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
 static inline int mddev_trylock(mddev_t * mddev)
 {
-       return down_trylock(&mddev->reconfig_sem);
+       return mutex_trylock(&mddev->reconfig_mutex);
 }
 
 static inline void mddev_unlock(mddev_t * mddev)
 {
-       up(&mddev->reconfig_sem);
+       mutex_unlock(&mddev->reconfig_mutex);
 
        md_wakeup_thread(mddev->thread);
 }
@@ -382,8 +389,12 @@ static int super_written(struct bio *bio, unsigned int bytes_done, int error)
        if (bio->bi_size)
                return 1;
 
-       if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+       if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+               printk("md: super_written gets error=%d, uptodate=%d\n",
+                      error, test_bit(BIO_UPTODATE, &bio->bi_flags));
+               WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
+       }
 
        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
@@ -728,6 +739,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+       __u64 ev1 = md_event(sb);
 
        rdev->raid_disk = -1;
        rdev->flags = 0;
@@ -744,7 +756,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
-               mddev->events = md_event(sb);
+               mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
@@ -793,7 +805,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
        } else if (mddev->pers == NULL) {
                /* Insist on good event counter while assembling */
-               __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events) 
                        return -EINVAL;
@@ -801,19 +812,21 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
-               __u64 ev1 = md_event(sb);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
-       } else /* just a hot-add of a new device, leave raid_disk at -1 */
-               return 0;
+       } else {
+               if (ev1 < mddev->events)
+                       /* just a hot-add of a new device, leave raid_disk at -1 */
+                       return 0;
+       }
 
        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;
 
                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
-               else if (desc->state & (1<<MD_DISK_SYNC) &&
-                        desc->raid_disk < mddev->raid_disks) {
+               else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+                           desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
@@ -1053,6 +1066,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        if (rdev->sb_size & bmask)
                rdev-> sb_size = (rdev->sb_size | bmask)+1;
 
+       if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
+               rdev->desc_nr = -1;
+       else
+               rdev->desc_nr = le32_to_cpu(sb->dev_number);
+
        if (refdev == 0)
                ret = 1;
        else {
@@ -1096,6 +1114,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+       __u64 ev1 = le64_to_cpu(sb->events);
 
        rdev->raid_disk = -1;
        rdev->flags = 0;
@@ -1111,7 +1130,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
-               mddev->events = le64_to_cpu(sb->events);
+               mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;
                
@@ -1145,7 +1164,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
        } else if (mddev->pers == NULL) {
                /* Insist of good event counter while assembling */
-               __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
@@ -1153,15 +1171,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
-               __u64 ev1 = le64_to_cpu(sb->events);
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
-       } else /* just a hot-add of a new device, leave raid_disk at -1 */
-               return 0;
-
+       } else {
+               if (ev1 < mddev->events)
+                       /* just a hot-add of a new device, leave raid_disk at -1 */
+                       return 0;
+       }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
-               rdev->desc_nr = le32_to_cpu(sb->dev_number);
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
@@ -1170,7 +1188,11 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
-                       set_bit(In_sync, &rdev->flags);
+                       if ((le32_to_cpu(sb->feature_map) &
+                            MD_FEATURE_RECOVERY_OFFSET))
+                               rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+                       else
+                               set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
@@ -1194,6 +1216,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        sb->feature_map = 0;
        sb->pad0 = 0;
+       sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1214,6 +1237,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }
+
+       if (rdev->raid_disk >= 0 &&
+           !test_bit(In_sync, &rdev->flags) &&
+           rdev->recovery_offset > 0) {
+               sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+               sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+       }
+
        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1238,11 +1269,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+               else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+                       sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }
 
-       sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
        sb->sb_csum = calc_sb_1_csum(sb);
 }
 
@@ -1380,7 +1412,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
 
-       bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+       bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
@@ -1390,7 +1422,7 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
        if (err) {
                printk(KERN_ERR "md: could not bd_claim %s.\n",
                        bdevname(bdev, b));
-               blkdev_put(bdev);
+               blkdev_put_partition(bdev);
                return err;
        }
        rdev->bdev = bdev;
@@ -1404,7 +1436,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
        if (!bdev)
                MD_BUG();
        bd_release(bdev);
-       blkdev_put(bdev);
+       blkdev_put_partition(bdev);
 }
 
 void md_autodetect_dev(dev_t dev);
@@ -1503,7 +1535,7 @@ static void print_rdev(mdk_rdev_t *rdev)
                printk(KERN_INFO "md: no rdev superblock!\n");
 }
 
-void md_print_devices(void)
+static void md_print_devices(void)
 {
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev;
@@ -1532,30 +1564,93 @@ void md_print_devices(void)
 }
 
 
-static void sync_sbs(mddev_t * mddev)
+static void sync_sbs(mddev_t * mddev, int nospares)
 {
+       /* Update each superblock (in-memory image), but
+        * if we are allowed to, skip spares which already
+        * have the right event counter, or have one earlier
+        * (which would mean they aren't being marked as dirty
+        * with the rest of the array)
+        */
        mdk_rdev_t *rdev;
        struct list_head *tmp;
 
        ITERATE_RDEV(mddev,rdev,tmp) {
-               super_types[mddev->major_version].
-                       sync_super(mddev, rdev);
-               rdev->sb_loaded = 1;
+               if (rdev->sb_events == mddev->events ||
+                   (nospares &&
+                    rdev->raid_disk < 0 &&
+                    (rdev->sb_events&1)==0 &&
+                    rdev->sb_events+1 == mddev->events)) {
+                       /* Don't update this superblock */
+                       rdev->sb_loaded = 2;
+               } else {
+                       super_types[mddev->major_version].
+                               sync_super(mddev, rdev);
+                       rdev->sb_loaded = 1;
+               }
        }
 }
 
-void md_update_sb(mddev_t * mddev)
+static void md_update_sb(mddev_t * mddev, int force_change)
 {
        int err;
        struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;
+       int nospares = 0;
 
 repeat:
        spin_lock_irq(&mddev->write_lock);
+
+       set_bit(MD_CHANGE_PENDING, &mddev->flags);
+       if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+               force_change = 1;
+       if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+               /* just a clean<-> dirty transition, possibly leave spares alone,
+                * though if events isn't the right even/odd, we will have to do
+                * spares after all
+                */
+               nospares = 1;
+       if (force_change)
+               nospares = 0;
+       if (mddev->degraded)
+               /* If the array is degraded, then skipping spares is both
+                * dangerous and fairly pointless.
+                * Dangerous because a device that was removed from the array
+                * might have a event_count that still looks up-to-date,
+                * so it can be re-added without a resync.
+                * Pointless because if there are any spares to skip,
+                * then a recovery will happen and soon that array won't
+                * be degraded any more and the spare can go back to sleep then.
+                */
+               nospares = 0;
+
        sync_req = mddev->in_sync;
        mddev->utime = get_seconds();
-       mddev->events ++;
+
+       /* If this is just a dirty<->clean transition, and the array is clean
+        * and 'events' is odd, we can roll back to the previous clean state */
+       if (nospares
+           && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+           && (mddev->events & 1))
+               mddev->events--;
+       else {
+               /* otherwise we have to go forward and ... */
+               mddev->events ++;
+               if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+                       /* .. if the array isn't clean, insist on an odd 'events' */
+                       if ((mddev->events&1)==0) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               } else {
+                       /* otherwise insist on an even 'events' (for clean states) */
+                       if ((mddev->events&1)) {
+                               mddev->events++;
+                               nospares = 0;
+                       }
+               }
+       }
 
        if (!mddev->events) {
                /*
@@ -1566,15 +1661,14 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
-       mddev->sb_dirty = 2;
-       sync_sbs(mddev);
+       sync_sbs(mddev, nospares);
 
        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
-               mddev->sb_dirty = 0;
+               clear_bit(MD_CHANGE_PENDING, &mddev->flags);
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
@@ -1589,6 +1683,8 @@ repeat:
        ITERATE_RDEV(mddev,rdev,tmp) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
+               if (rdev->sb_loaded != 1)
+                       continue; /* no noise on spare devices */
                if (test_bit(Faulty, &rdev->flags))
                        dprintk("(skipping faulty ");
 
@@ -1600,6 +1696,7 @@ repeat:
                        dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                bdevname(rdev->bdev,b),
                                (unsigned long long)rdev->sb_offset);
+                       rdev->sb_events = mddev->events;
 
                } else
                        dprintk(")\n");
@@ -1608,20 +1705,20 @@ repeat:
                        break;
        }
        md_super_wait(mddev);
-       /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+       /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
        spin_lock_irq(&mddev->write_lock);
-       if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+       if (mddev->in_sync != sync_req ||
+           test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
                /* have to write it out again */
                spin_unlock_irq(&mddev->write_lock);
                goto repeat;
        }
-       mddev->sb_dirty = 0;
+       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
        spin_unlock_irq(&mddev->write_lock);
        wake_up(&mddev->sb_wait);
 
 }
-EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -1663,6 +1760,10 @@ state_show(mdk_rdev_t *rdev, char *page)
                len += sprintf(page+len, "%sin_sync",sep);
                sep = ",";
        }
+       if (test_bit(WriteMostly, &rdev->flags)) {
+               len += sprintf(page+len, "%swrite_mostly",sep);
+               sep = ",";
+       }
        if (!test_bit(Faulty, &rdev->flags) &&
            !test_bit(In_sync, &rdev->flags)) {
                len += sprintf(page+len, "%sspare", sep);
@@ -1671,8 +1772,40 @@ state_show(mdk_rdev_t *rdev, char *page)
        return len+sprintf(page+len, "\n");
 }
 
-static struct rdev_sysfs_entry
-rdev_state = __ATTR_RO(state);
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+       /* can write
+        *  faulty  - simulates and error
+        *  remove  - disconnects the device
+        *  writemostly - sets write_mostly
+        *  -writemostly - clears write_mostly
+        */
+       int err = -EINVAL;
+       if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+               md_error(rdev->mddev, rdev);
+               err = 0;
+       } else if (cmd_match(buf, "remove")) {
+               if (rdev->raid_disk >= 0)
+                       err = -EBUSY;
+               else {
+                       mddev_t *mddev = rdev->mddev;
+                       kick_rdev_from_array(rdev);
+                       md_update_sb(mddev, 1);
+                       md_new_event(mddev);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "writemostly")) {
+               set_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       } else if (cmd_match(buf, "-writemostly")) {
+               clear_bit(WriteMostly, &rdev->flags);
+               err = 0;
+       }
+       return err ? err : len;
+}
+static struct rdev_sysfs_entry rdev_state =
+__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
 
 static ssize_t
 super_show(mdk_rdev_t *rdev, char *page)
@@ -1703,7 +1836,7 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        return -EINVAL;
 }
 static struct rdev_sysfs_entry rdev_errors =
-__ATTR(errors, 0644, errors_show, errors_store);
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
 
 static ssize_t
 slot_show(mdk_rdev_t *rdev, char *page)
@@ -1737,7 +1870,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 
 static struct rdev_sysfs_entry rdev_slot =
-__ATTR(slot, 0644, slot_show, slot_store);
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
 
 static ssize_t
 offset_show(mdk_rdev_t *rdev, char *page)
@@ -1759,7 +1892,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_offset =
-__ATTR(offset, 0644, offset_show, offset_store);
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
@@ -1783,7 +1916,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 }
 
 static struct rdev_sysfs_entry rdev_size =
-__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
 
 static struct attribute *rdev_default_attrs[] = {
        &rdev_state.attr,
@@ -1814,6 +1947,8 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
 
        if (!entry->store)
                return -EIO;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
        return entry->store(rdev, page, length);
 }
 
@@ -1869,6 +2004,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        rdev->desc_nr = -1;
        rdev->flags = 0;
        rdev->data_offset = 0;
+       rdev->sb_events = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
@@ -1974,6 +2110,54 @@ static void analyze_sbs(mddev_t * mddev)
 }
 
 static ssize_t
+safe_delay_show(mddev_t *mddev, char *page)
+{
+       int msec = (mddev->safemode_delay*1000)/HZ;
+       return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+}
+static ssize_t
+safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
+{
+       int scale=1;
+       int dot=0;
+       int i;
+       unsigned long msec;
+       char buf[30];
+       char *e;
+       /* remove a period, and count digits after it */
+       if (len >= sizeof(buf))
+               return -EINVAL;
+       strlcpy(buf, cbuf, len);
+       buf[len] = 0;
+       for (i=0; i<len; i++) {
+               if (dot) {
+                       if (isdigit(buf[i])) {
+                               buf[i-1] = buf[i];
+                               scale *= 10;
+                       }
+                       buf[i] = 0;
+               } else if (buf[i] == '.') {
+                       dot=1;
+                       buf[i] = 0;
+               }
+       }
+       msec = simple_strtoul(buf, &e, 10);
+       if (e == buf || (*e && *e != '\n'))
+               return -EINVAL;
+       msec = (msec * 1000) / scale;
+       if (msec == 0)
+               mddev->safemode_delay = 0;
+       else {
+               mddev->safemode_delay = (msec*HZ)/1000;
+               if (mddev->safemode_delay == 0)
+                       mddev->safemode_delay = 1;
+       }
+       return len;
+}
+static struct md_sysfs_entry md_safe_delay =
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
+
+static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
        struct mdk_personality *p = mddev->pers;
@@ -2006,7 +2190,33 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_level =
-__ATTR(level, 0644, level_show, level_store);
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
+
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+       /* just a number, not meaningful for all levels */
+       return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *e;
+       unsigned long n = simple_strtoul(buf, &e, 10);
+       if (mddev->pers)
+               return -EBUSY;
+
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->layout = n;
+       return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
+
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -2036,7 +2246,7 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
        return rv ? rv : len;
 }
 static struct md_sysfs_entry md_raid_disks =
-__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
 
 static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
@@ -2060,7 +2270,202 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
        return len;
 }
 static struct md_sysfs_entry md_chunk_size =
-__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
+
+static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       /* can only set chunk_size if array is not yet active */
+       char *e;
+       unsigned long long n = simple_strtoull(buf, &e, 10);
+
+       if (mddev->pers)
+               return -EBUSY;
+       if (!*buf || (*e && *e != '\n'))
+               return -EINVAL;
+
+       mddev->recovery_cp = n;
+       return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ *     No devices, no size, no level
+ *     Equivalent to STOP_ARRAY ioctl
+ * inactive
+ *     May have some settings, but array is not active
+ *        all IO results in error
+ *     When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ *     All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until array is quiessent
+ * readonly
+ *     no resync can happen.  no superblocks get written.
+ *     write requests fail
+ * read-auto
+ *     like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ *     When written to inactive array, starts without resync
+ *     If a write request arrives then
+ *       if metadata is known, mark 'dirty' and switch to 'active'.
+ *       if not known, block and switch to write-pending
+ *     If written to an active array that has pending writes, then fails.
+ * active
+ *     fully active: IO and resync can be happening.
+ *     When written to inactive array, starts with resync
+ *
+ * write-pending
+ *     clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ *     like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+                  write_pending, active_idle, bad_word};
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+       "write-pending", "active-idle", NULL };
+
+static int match_word(const char *word, char **list)
+{
+       int n;
+       for (n=0; list[n]; n++)
+               if (cmd_match(word, list[n]))
+                       break;
+       return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+       enum array_state st = inactive;
+
+       if (mddev->pers)
+               switch(mddev->ro) {
+               case 1:
+                       st = readonly;
+                       break;
+               case 2:
+                       st = read_auto;
+                       break;
+               case 0:
+                       if (mddev->in_sync)
+                               st = clean;
+                       else if (mddev->safemode)
+                               st = active_idle;
+                       else
+                               st = active;
+               }
+       else {
+               if (list_empty(&mddev->disks) &&
+                   mddev->raid_disks == 0 &&
+                   mddev->size == 0)
+                       st = clear;
+               else
+                       st = inactive;
+       }
+       return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       int err = -EINVAL;
+       enum array_state st = match_word(buf, array_states);
+       switch(st) {
+       case bad_word:
+               break;
+       case clear:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 0);
+               }
+               break;
+       case inactive:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       if (atomic_read(&mddev->active) > 1)
+                               return -EBUSY;
+                       err = do_md_stop(mddev, 2);
+               }
+               break;
+       case suspended:
+               break; /* not supported yet */
+       case readonly:
+               if (mddev->pers)
+                       err = do_md_stop(mddev, 1);
+               else {
+                       mddev->ro = 1;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case read_auto:
+               /* stopping an active array */
+               if (mddev->pers) {
+                       err = do_md_stop(mddev, 1);
+                       if (err == 0)
+                               mddev->ro = 2; /* FIXME mark devices writable */
+               } else {
+                       mddev->ro = 2;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case clean:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       spin_lock_irq(&mddev->write_lock);
+                       if (atomic_read(&mddev->writes_pending) == 0) {
+                               mddev->in_sync = 1;
+                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       }
+                       spin_unlock_irq(&mddev->write_lock);
+               } else {
+                       mddev->ro = 0;
+                       mddev->recovery_cp = MaxSector;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case active:
+               if (mddev->pers) {
+                       restart_array(mddev);
+                       clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       wake_up(&mddev->sb_wait);
+                       err = 0;
+               } else {
+                       mddev->ro = 0;
+                       err = do_md_run(mddev);
+               }
+               break;
+       case write_pending:
+       case active_idle:
+               /* these cannot be set */
+               break;
+       }
+       if (err)
+               return err;
+       else
+               return len;
+}
+static struct md_sysfs_entry md_array_state =
+__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 
 static ssize_t
 null_show(mddev_t *mddev, char *page)
@@ -2120,7 +2525,37 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_new_device =
-__ATTR(new_dev, 0200, null_show, new_dev_store);
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
+
+static ssize_t
+bitmap_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       char *end;
+       unsigned long chunk, end_chunk;
+
+       if (!mddev->bitmap)
+               goto out;
+       /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
+       while (*buf) {
+               chunk = end_chunk = simple_strtoul(buf, &end, 0);
+               if (buf == end) break;
+               if (*end == '-') { /* range */
+                       buf = end + 1;
+                       end_chunk = simple_strtoul(buf, &end, 0);
+                       if (buf == end) break;
+               }
+               if (*end && !isspace(*end)) break;
+               bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+               buf = end;
+               while (isspace(*buf)) buf++;
+       }
+       bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+out:
+       return len;
+}
+
+static struct md_sysfs_entry md_bitmap =
+__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 
 static ssize_t
 size_show(mddev_t *mddev, char *page)
@@ -2146,7 +2581,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 
        if (mddev->pers) {
                err = update_size(mddev, size);
-               md_update_sb(mddev);
+               md_update_sb(mddev, 1);
        } else {
                if (mddev->size == 0 ||
                    mddev->size > size)
@@ -2158,7 +2593,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_size =
-__ATTR(component_size, 0644, size_show, size_store);
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 
 /* Metdata version.
@@ -2206,7 +2641,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
 }
 
 static struct md_sysfs_entry md_metadata =
-__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
 
 static ssize_t
 action_show(mddev_t *mddev, char *page)
@@ -2257,7 +2692,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
        } else {
                if (cmd_match(page, "check"))
                        set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
-               else if (cmd_match(page, "repair"))
+               else if (!cmd_match(page, "repair"))
                        return -EINVAL;
                set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -2274,12 +2709,11 @@ mismatch_cnt_show(mddev_t *mddev, char *page)
                       (unsigned long long) mddev->resync_mismatches);
 }
 
-static struct md_sysfs_entry
-md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 
 
-static struct md_sysfs_entry
-md_mismatches = __ATTR_RO(mismatch_cnt);
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
 sync_min_show(mddev_t *mddev, char *page)
@@ -2338,15 +2772,14 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
        unsigned long resync, dt, db;
-       resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+       resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
        db = resync - (mddev->resync_mark_cnt);
        return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
 }
 
-static struct md_sysfs_entry
-md_sync_speed = __ATTR_RO(sync_speed);
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
@@ -2362,8 +2795,7 @@ sync_completed_show(mddev_t *mddev, char *page)
        return sprintf(page, "%lu / %lu\n", resync, max_blocks);
 }
 
-static struct md_sysfs_entry
-md_sync_completed = __ATTR_RO(sync_completed);
+static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
 suspend_lo_show(mddev_t *mddev, char *page)
@@ -2424,11 +2856,15 @@ __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
+       &md_layout.attr,
        &md_raid_disks.attr,
        &md_chunk_size.attr,
        &md_size.attr,
+       &md_resync_start.attr,
        &md_metadata.attr,
        &md_new_device.attr,
+       &md_safe_delay.attr,
+       &md_array_state.attr,
        NULL,
 };
 
@@ -2441,6 +2877,7 @@ static struct attribute *md_redundancy_attrs[] = {
        &md_sync_completed.attr,
        &md_suspend_lo.attr,
        &md_suspend_hi.attr,
+       &md_bitmap.attr,
        NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -2458,9 +2895,11 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 
        if (!entry->show)
                return -EIO;
-       mddev_lock(mddev);
-       rv = entry->show(mddev, page);
-       mddev_unlock(mddev);
+       rv = mddev_lock(mddev);
+       if (!rv) {
+               rv = entry->show(mddev, page);
+               mddev_unlock(mddev);
+       }
        return rv;
 }
 
@@ -2474,9 +2913,13 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 
        if (!entry->store)
                return -EIO;
-       mddev_lock(mddev);
-       rv = entry->store(mddev, page, length);
-       mddev_unlock(mddev);
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
+       rv = mddev_lock(mddev);
+       if (!rv) {
+               rv = entry->store(mddev, page, length);
+               mddev_unlock(mddev);
+       }
        return rv;
 }
 
@@ -2500,7 +2943,7 @@ int mdp_major = 0;
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
 {
-       static DECLARE_MUTEX(disks_sem);
+       static DEFINE_MUTEX(disks_mutex);
        mddev_t *mddev = mddev_find(dev);
        struct gendisk *disk;
        int partitioned = (MAJOR(dev) != MD_MAJOR);
@@ -2510,33 +2953,30 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        if (!mddev)
                return NULL;
 
-       down(&disks_sem);
+       mutex_lock(&disks_mutex);
        if (mddev->gendisk) {
-               up(&disks_sem);
+               mutex_unlock(&disks_mutex);
                mddev_put(mddev);
                return NULL;
        }
        disk = alloc_disk(1 << shift);
        if (!disk) {
-               up(&disks_sem);
+               mutex_unlock(&disks_mutex);
                mddev_put(mddev);
                return NULL;
        }
        disk->major = MAJOR(dev);
        disk->first_minor = unit << shift;
-       if (partitioned) {
+       if (partitioned)
                sprintf(disk->disk_name, "md_d%d", unit);
-               sprintf(disk->devfs_name, "md/d%d", unit);
-       } else {
+       else
                sprintf(disk->disk_name, "md%d", unit);
-               sprintf(disk->devfs_name, "md/%d", unit);
-       }
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
        add_disk(disk);
        mddev->gendisk = disk;
-       up(&disks_sem);
+       mutex_unlock(&disks_mutex);
        mddev->kobj.parent = &disk->kobj;
        mddev->kobj.k_name = NULL;
        snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
@@ -2545,8 +2985,6 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        return NULL;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread);
-
 static void md_safemode_timeout(unsigned long data)
 {
        mddev_t *mddev = (mddev_t *) data;
@@ -2700,7 +3138,7 @@ static int do_md_run(mddev_t * mddev)
        mddev->safemode = 0;
        mddev->safemode_timer.function = md_safemode_timeout;
        mddev->safemode_timer.data = (unsigned long) mddev;
-       mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+       mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;
 
        ITERATE_RDEV(mddev,rdev,tmp)
@@ -2711,10 +3149,9 @@ static int do_md_run(mddev_t * mddev)
                }
        
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-       md_wakeup_thread(mddev->thread);
        
-       if (mddev->sb_dirty)
-               md_update_sb(mddev);
+       if (mddev->flags)
+               md_update_sb(mddev, 0);
 
        set_capacity(disk, mddev->array_size<<1);
 
@@ -2728,6 +3165,37 @@ static int do_md_run(mddev_t * mddev)
        mddev->queue->queuedata = mddev;
        mddev->queue->make_request_fn = mddev->pers->make_request;
 
+       /* If there is a partially-recovered drive we need to
+        * start recovery here.  If we leave it to md_check_recovery,
+        * it will remove the drives and not do the right thing
+        */
+       if (mddev->degraded && !mddev->sync_thread) {
+               struct list_head *rtmp;
+               int spares = 0;
+               ITERATE_RDEV(mddev,rdev,rtmp)
+                       if (rdev->raid_disk >= 0 &&
+                           !test_bit(In_sync, &rdev->flags) &&
+                           !test_bit(Faulty, &rdev->flags))
+                               /* complete an interrupted recovery */
+                               spares++;
+               if (spares && mddev->pers->sync_request) {
+                       mddev->recovery = 0;
+                       set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+                       mddev->sync_thread = md_register_thread(md_do_sync,
+                                                               mddev,
+                                                               "%s_resync");
+                       if (!mddev->sync_thread) {
+                               printk(KERN_ERR "%s: could not start resync"
+                                      " thread...\n",
+                                      mdname(mddev));
+                               /* leave the spares where they are, it shouldn't hurt */
+                               mddev->recovery = 0;
+                       }
+               }
+       }
+       md_wakeup_thread(mddev->thread);
+       md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
        mddev->changed = 1;
        md_new_event(mddev);
        return 0;
@@ -2761,18 +3229,47 @@ static int restart_array(mddev_t *mddev)
                 */
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
+               md_wakeup_thread(mddev->sync_thread);
                err = 0;
-       } else {
-               printk(KERN_ERR "md: %s has no personality assigned.\n",
-                       mdname(mddev));
+       } else
                err = -EINVAL;
-       }
 
 out:
        return err;
 }
 
-static int do_md_stop(mddev_t * mddev, int ro)
+/* similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves */
+static int deny_bitmap_write_access(struct file * file)
+{
+       struct inode *inode = file->f_mapping->host;
+
+       spin_lock(&inode->i_lock);
+       if (atomic_read(&inode->i_writecount) > 1) {
+               spin_unlock(&inode->i_lock);
+               return -ETXTBSY;
+       }
+       atomic_set(&inode->i_writecount, -1);
+       spin_unlock(&inode->i_lock);
+
+       return 0;
+}
+
+static void restore_bitmap_write_access(struct file *file)
+{
+       struct inode *inode = file->f_mapping->host;
+
+       spin_lock(&inode->i_lock);
+       atomic_set(&inode->i_writecount, 1);
+       spin_unlock(&inode->i_lock);
+}
+
+/* mode:
+ *   0 - completely stop and dis-assemble array
+ *   1 - switch to readonly
+ *   2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
 {
        int err = 0;
        struct gendisk *disk = mddev->gendisk;
@@ -2784,6 +3281,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
                }
 
                if (mddev->sync_thread) {
+                       set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                        md_unregister_thread(mddev->sync_thread);
                        mddev->sync_thread = NULL;
@@ -2793,12 +3291,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                invalidate_partition(disk, 0);
 
-               if (ro) {
+               switch(mode) {
+               case 1: /* readonly */
                        err  = -ENXIO;
                        if (mddev->ro==1)
                                goto out;
                        mddev->ro = 1;
-               } else {
+                       break;
+               case 0: /* disassemble */
+               case 2: /* stop */
                        bitmap_flush(mddev);
                        md_super_wait(mddev);
                        if (mddev->ro)
@@ -2813,19 +3314,20 @@ static int do_md_stop(mddev_t * mddev, int ro)
                        if (mddev->ro)
                                mddev->ro = 0;
                }
-               if (!mddev->in_sync) {
+               if (!mddev->in_sync || mddev->flags) {
                        /* mark array as shutdown cleanly */
                        mddev->in_sync = 1;
-                       md_update_sb(mddev);
+                       md_update_sb(mddev, 1);
                }
-               if (ro)
+               if (mode == 1)
                        set_disk_ro(disk, 1);
+               clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
        }
 
        /*
         * Free resources if final stop
         */
-       if (!ro) {
+       if (mode == 0) {
                mdk_rdev_t *rdev;
                struct list_head *tmp;
                struct gendisk *disk;
@@ -2833,7 +3335,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 
                bitmap_destroy(mddev);
                if (mddev->bitmap_file) {
-                       atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
+                       restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
                        mddev->bitmap_file = NULL;
                }
@@ -2849,11 +3351,15 @@ static int do_md_stop(mddev_t * mddev, int ro)
                export_array(mddev);
 
                mddev->array_size = 0;
+               mddev->size = 0;
+               mddev->raid_disks = 0;
+               mddev->recovery_cp = 0;
+
                disk = mddev->gendisk;
                if (disk)
                        set_capacity(disk, 0);
                mddev->changed = 1;
-       } else
+       } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
                        mdname(mddev));
        err = 0;
@@ -2907,6 +3413,7 @@ static void autorun_devices(int part)
 
        printk(KERN_INFO "md: autorun ...\n");
        while (!list_empty(&pending_raid_disks)) {
+               int unit;
                dev_t dev;
                LIST_HEAD(candidates);
                rdev0 = list_entry(pending_raid_disks.next,
@@ -2926,16 +3433,19 @@ static void autorun_devices(int part)
                 * mostly sane superblocks. It's time to allocate the
                 * mddev.
                 */
-               if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+               if (part) {
+                       dev = MKDEV(mdp_major,
+                                   rdev0->preferred_minor << MdpMinorShift);
+                       unit = MINOR(dev) >> MdpMinorShift;
+               } else {
+                       dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+                       unit = MINOR(dev);
+               }
+               if (rdev0->preferred_minor != unit) {
                        printk(KERN_INFO "md: unit number in %s is bad: %d\n",
                               bdevname(rdev0->bdev, b), rdev0->preferred_minor);
                        break;
                }
-               if (part)
-                       dev = MKDEV(mdp_major,
-                                   rdev0->preferred_minor << MdpMinorShift);
-               else
-                       dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
 
                md_probe(dev, NULL, NULL);
                mddev = mddev_find(dev);
@@ -2973,67 +3483,6 @@ static void autorun_devices(int part)
        printk(KERN_INFO "md: ... autorun DONE.\n");
 }
 
-/*
- * import RAID devices based on one partition
- * if possible, the array gets run as well.
- */
-
-static int autostart_array(dev_t startdev)
-{
-       char b[BDEVNAME_SIZE];
-       int err = -EINVAL, i;
-       mdp_super_t *sb = NULL;
-       mdk_rdev_t *start_rdev = NULL, *rdev;
-
-       start_rdev = md_import_device(startdev, 0, 0);
-       if (IS_ERR(start_rdev))
-               return err;
-
-
-       /* NOTE: this can only work for 0.90.0 superblocks */
-       sb = (mdp_super_t*)page_address(start_rdev->sb_page);
-       if (sb->major_version != 0 ||
-           sb->minor_version != 90 ) {
-               printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
-               export_rdev(start_rdev);
-               return err;
-       }
-
-       if (test_bit(Faulty, &start_rdev->flags)) {
-               printk(KERN_WARNING 
-                       "md: can not autostart based on faulty %s!\n",
-                       bdevname(start_rdev->bdev,b));
-               export_rdev(start_rdev);
-               return err;
-       }
-       list_add(&start_rdev->same_set, &pending_raid_disks);
-
-       for (i = 0; i < MD_SB_DISKS; i++) {
-               mdp_disk_t *desc = sb->disks + i;
-               dev_t dev = MKDEV(desc->major, desc->minor);
-
-               if (!dev)
-                       continue;
-               if (dev == startdev)
-                       continue;
-               if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
-                       continue;
-               rdev = md_import_device(dev, 0, 0);
-               if (IS_ERR(rdev))
-                       continue;
-
-               list_add(&rdev->same_set, &pending_raid_disks);
-       }
-
-       /*
-        * possibly return codes
-        */
-       autorun_devices(0);
-       return 0;
-
-}
-
-
 static int get_version(void __user * arg)
 {
        mdu_version_t ver;
@@ -3256,6 +3705,17 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
                rdev->raid_disk = -1;
                err = bind_rdev_to_array(rdev, mddev);
+               if (!err && !mddev->pers->hot_remove_disk) {
+                       /* If there is hot_add_disk but no hot_remove_disk
+                        * then added disks for geometry changes,
+                        * and should be added immediately.
+                        */
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+                       err = mddev->pers->hot_add_disk(mddev, rdev);
+                       if (err)
+                               unbind_rdev_from_array(rdev);
+               }
                if (err)
                        export_rdev(rdev);
 
@@ -3330,7 +3790,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
                goto busy;
 
        kick_rdev_from_array(rdev);
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
        md_new_event(mddev);
 
        return 0;
@@ -3389,6 +3849,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
        }
        clear_bit(In_sync, &rdev->flags);
        rdev->desc_nr = -1;
+       rdev->saved_raid_disk = -1;
        err = bind_rdev_to_array(rdev, mddev);
        if (err)
                goto abort_export;
@@ -3407,7 +3868,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 
        rdev->raid_disk = -1;
 
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
 
        /*
         * Kick recovery, maybe this spare has to be added to the
@@ -3426,23 +3887,6 @@ abort_export:
        return err;
 }
 
-/* similar to deny_write_access, but accounts for our holding a reference
- * to the file ourselves */
-static int deny_bitmap_write_access(struct file * file)
-{
-       struct inode *inode = file->f_mapping->host;
-
-       spin_lock(&inode->i_lock);
-       if (atomic_read(&inode->i_writecount) > 1) {
-               spin_unlock(&inode->i_lock);
-               return -ETXTBSY;
-       }
-       atomic_set(&inode->i_writecount, -1);
-       spin_unlock(&inode->i_lock);
-
-       return 0;
-}
-
 static int set_bitmap_file(mddev_t *mddev, int fd)
 {
        int err;
@@ -3483,12 +3927,17 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
                mddev->pers->quiesce(mddev, 1);
                if (fd >= 0)
                        err = bitmap_create(mddev);
-               if (fd < 0 || err)
+               if (fd < 0 || err) {
                        bitmap_destroy(mddev);
+                       fd = -1; /* make sure to put the file */
+               }
                mddev->pers->quiesce(mddev, 0);
-       } else if (fd < 0) {
-               if (mddev->bitmap_file)
+       }
+       if (fd < 0) {
+               if (mddev->bitmap_file) {
+                       restore_bitmap_write_access(mddev->bitmap_file);
                        fput(mddev->bitmap_file);
+               }
                mddev->bitmap_file = NULL;
        }
 
@@ -3550,7 +3999,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
        mddev->max_disks     = MD_SB_DISKS;
 
-       mddev->sb_dirty      = 1;
+       mddev->flags         = 0;
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
        mddev->bitmap_offset = 0;
@@ -3575,6 +4025,7 @@ static int update_size(mddev_t *mddev, unsigned long size)
        mdk_rdev_t * rdev;
        int rv;
        struct list_head *tmp;
+       int fit = (size == 0);
 
        if (mddev->pers->resize == NULL)
                return -EINVAL;
@@ -3592,7 +4043,6 @@ static int update_size(mddev_t *mddev, unsigned long size)
                return -EBUSY;
        ITERATE_RDEV(mddev,rdev,tmp) {
                sector_t avail;
-               int fit = (size == 0);
                if (rdev->sb_offset > rdev->data_offset)
                        avail = (rdev->sb_offset*2) - rdev->data_offset;
                else
@@ -3719,7 +4169,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                        mddev->bitmap_offset = 0;
                }
        }
-       md_update_sb(mddev);
+       md_update_sb(mddev, 1);
        return rv;
 }
 
@@ -3793,27 +4243,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
                goto abort;
        }
 
-
-       if (cmd == START_ARRAY) {
-               /* START_ARRAY doesn't need to lock the array as autostart_array
-                * does the locking, and it could even be a different array
-                */
-               static int cnt = 3;
-               if (cnt > 0 ) {
-                       printk(KERN_WARNING
-                              "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
-                              "This will not be supported beyond July 2006\n",
-                              current->comm, current->pid);
-                       cnt--;
-               }
-               err = autostart_array(new_decode_dev(arg));
-               if (err) {
-                       printk(KERN_WARNING "md: autostart failed!\n");
-                       goto abort;
-               }
-               goto done;
-       }
-
        err = mddev_lock(mddev);
        if (err) {
                printk(KERN_INFO 
@@ -3969,11 +4398,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
                        goto done_unlock;
 
                default:
-                       if (_IOC_TYPE(cmd) == MD_MAJOR)
-                               printk(KERN_WARNING "md: %s(pid %d) used"
-                                       " obsolete MD ioctl, upgrade your"
-                                       " software to use new ictls.\n",
-                                       current->comm, current->pid);
                        err = -EINVAL;
                        goto abort_unlock;
        }
@@ -4015,8 +4439,7 @@ static int md_release(struct inode *inode, struct file * file)
 {
        mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
 
-       if (!mddev)
-               BUG();
+       BUG_ON(!mddev);
        mddev_put(mddev);
 
        return 0;
@@ -4144,13 +4567,15 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
                __builtin_return_address(0),__builtin_return_address(1),
                __builtin_return_address(2),__builtin_return_address(3));
 */
+       if (!mddev->pers)
+               return;
        if (!mddev->pers->error_handler)
                return;
        mddev->pers->error_handler(mddev,rdev);
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
-       md_new_event(mddev);
+       md_new_event_inintr(mddev);
 }
 
 /* seq_file implementation /proc/mdstat */
@@ -4224,9 +4649,11 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
        seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
                   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
                    "reshape" :
-                     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
-                      "resync" : "recovery")),
-                     per_milli/10, per_milli % 10,
+                   (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
+                    "check" :
+                    (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+                     "resync" : "recovery"))),
+                  per_milli/10, per_milli % 10,
                   (unsigned long long) resync,
                   (unsigned long long) max_blocks);
 
@@ -4241,12 +4668,13 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
         */
        dt = ((jiffies - mddev->resync_mark) / HZ);
        if (!dt) dt++;
-       db = resync - (mddev->resync_mark_cnt/2);
-       rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
+       db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
+               - mddev->resync_mark_cnt;
+       rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
 
        seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
 
-       seq_printf(seq, " speed=%ldK/sec", db/dt);
+       seq_printf(seq, " speed=%ldK/sec", db/2/dt);
 }
 
 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -4341,8 +4769,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
                return 0;
        }
 
-       if (mddev_lock(mddev)!=0) 
+       if (mddev_lock(mddev) < 0)
                return -EINTR;
+
        if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
                seq_printf(seq, "%s : %sactive", mdname(mddev),
                                                mddev->pers ? "" : "in");
@@ -4577,12 +5006,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
                spin_lock_irq(&mddev->write_lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
-                       mddev->sb_dirty = 1;
+                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        md_wakeup_thread(mddev->thread);
                }
                spin_unlock_irq(&mddev->write_lock);
        }
-       wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+       wait_event(mddev->sb_wait, mddev->flags==0);
 }
 
 void md_write_end(mddev_t *mddev)
@@ -4590,7 +5019,7 @@ void md_write_end(mddev_t *mddev)
        if (atomic_dec_and_test(&mddev->writes_pending)) {
                if (mddev->safemode == 2)
                        md_wakeup_thread(mddev->thread);
-               else
+               else if (mddev->safemode_delay)
                        mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
        }
 }
@@ -4611,10 +5040,27 @@ void md_do_sync(mddev_t *mddev)
        struct list_head *tmp;
        sector_t last_check;
        int skipped = 0;
+       struct list_head *rtmp;
+       mdk_rdev_t *rdev;
+       char *desc;
 
        /* just incase thread restarts... */
        if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
                return;
+       if (mddev->ro) /* never try to sync a read-only array */
+               return;
+
+       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+               if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+                       desc = "data-check";
+               else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+                       desc = "requested-resync";
+               else
+                       desc = "resync";
+       } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+               desc = "reshape";
+       else
+               desc = "recovery";
 
        /* we overload curr_resync somewhat here.
         * 0 == not engaged in resync at all
@@ -4659,10 +5105,10 @@ void md_do_sync(mddev_t *mddev)
                                prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
                                if (!kthread_should_stop() &&
                                    mddev2->curr_resync >= mddev->curr_resync) {
-                                       printk(KERN_INFO "md: delaying resync of %s"
-                                              " until %s has finished resync (they"
+                                       printk(KERN_INFO "md: delaying %s of %s"
+                                              " until %s has finished (they"
                                               " share one or more physical units)\n",
-                                              mdname(mddev), mdname(mddev2));
+                                              desc, mdname(mddev), mdname(mddev2));
                                        mddev_put(mddev2);
                                        schedule();
                                        finish_wait(&resync_wait, &wq);
@@ -4673,32 +5119,40 @@ void md_do_sync(mddev_t *mddev)
                }
        } while (mddev->curr_resync < 2);
 
+       j = 0;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
                /* resync follows the size requested by the personality,
                 * which defaults to physical size, but can be virtual size
                 */
                max_sectors = mddev->resync_max_sectors;
                mddev->resync_mismatches = 0;
+               /* we don't use the checkpoint if there's a bitmap */
+               if (!mddev->bitmap &&
+                   !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+                       j = mddev->recovery_cp;
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                max_sectors = mddev->size << 1;
-       else
+       else {
                /* recovery follows the physical size of devices */
                max_sectors = mddev->size << 1;
+               j = MaxSector;
+               ITERATE_RDEV(mddev,rdev,rtmp)
+                       if (rdev->raid_disk >= 0 &&
+                           !test_bit(Faulty, &rdev->flags) &&
+                           !test_bit(In_sync, &rdev->flags) &&
+                           rdev->recovery_offset < j)
+                               j = rdev->recovery_offset;
+       }
 
-       printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
-       printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-               " %d KB/sec/disc.\n", speed_min(mddev));
+       printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
+       printk(KERN_INFO "md: minimum _guaranteed_  speed:"
+               " %d KB/sec/disk.\n", speed_min(mddev));
        printk(KERN_INFO "md: using maximum available idle IO bandwidth "
-              "(but not more than %d KB/sec) for reconstruction.\n",
-              speed_max(mddev));
+              "(but not more than %d KB/sec) for %s.\n",
+              speed_max(mddev), desc);
 
        is_mddev_idle(mddev); /* this also initializes IO event counters */
-       /* we don't use the checkpoint if there's a bitmap */
-       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-           && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-               j = mddev->recovery_cp;
-       else
-               j = 0;
+
        io_sectors = 0;
        for (m = 0; m < SYNC_MARKS; m++) {
                mark[m] = jiffies;
@@ -4721,8 +5175,8 @@ void md_do_sync(mddev_t *mddev)
 
        if (j>2) {
                printk(KERN_INFO 
-                       "md: resuming recovery of %s from checkpoint.\n",
-                       mdname(mddev));
+                      "md: resuming %s of %s from checkpoint.\n",
+                      desc, mdname(mddev));
                mddev->curr_resync = j;
        }
 
@@ -4744,6 +5198,7 @@ void md_do_sync(mddev_t *mddev)
 
                j += sectors;
                if (j>1) mddev->curr_resync = j;
+               mddev->curr_mark_cnt = io_sectors;
                if (last_check == 0)
                        /* this is the earliers that rebuilt will be
                         * visible in /proc/mdstat
@@ -4804,7 +5259,7 @@ void md_do_sync(mddev_t *mddev)
                        }
                }
        }
-       printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+       printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
        /*
         * this also signals 'finished resyncing' to md_stop
         */
@@ -4819,15 +5274,27 @@ void md_do_sync(mddev_t *mddev)
        if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
            test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-           mddev->curr_resync > 2 &&
-           mddev->curr_resync >= mddev->recovery_cp) {
-               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-                       printk(KERN_INFO 
-                               "md: checkpointing recovery of %s.\n",
-                               mdname(mddev));
-                       mddev->recovery_cp = mddev->curr_resync;
-               } else
-                       mddev->recovery_cp = MaxSector;
+           mddev->curr_resync > 2) {
+               if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+                       if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+                               if (mddev->curr_resync >= mddev->recovery_cp) {
+                                       printk(KERN_INFO
+                                              "md: checkpointing %s of %s.\n",
+                                              desc, mdname(mddev));
+                                       mddev->recovery_cp = mddev->curr_resync;
+                               }
+                       } else
+                               mddev->recovery_cp = MaxSector;
+               } else {
+                       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                               mddev->curr_resync = MaxSector;
+                       ITERATE_RDEV(mddev,rdev,rtmp)
+                               if (rdev->raid_disk >= 0 &&
+                                   !test_bit(Faulty, &rdev->flags) &&
+                                   !test_bit(In_sync, &rdev->flags) &&
+                                   rdev->recovery_offset < mddev->curr_resync)
+                                       rdev->recovery_offset = mddev->curr_resync;
+               }
        }
 
  skip:
@@ -4883,7 +5350,7 @@ void md_check_recovery(mddev_t *mddev)
        }
 
        if ( ! (
-               mddev->sb_dirty ||
+               mddev->flags ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
                (mddev->safemode == 1) ||
@@ -4892,21 +5359,21 @@ void md_check_recovery(mddev_t *mddev)
                ))
                return;
 
-       if (mddev_trylock(mddev)==0) {
+       if (mddev_trylock(mddev)) {
                int spares =0;
 
                spin_lock_irq(&mddev->write_lock);
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       mddev->sb_dirty = 1;
+                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
                spin_unlock_irq(&mddev->write_lock);
 
-               if (mddev->sb_dirty)
-                       md_update_sb(mddev);
+               if (mddev->flags)
+                       md_update_sb(mddev, 0);
 
 
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
@@ -4925,7 +5392,7 @@ void md_check_recovery(mddev_t *mddev)
                                /* activate any spares */
                                mddev->pers->spare_active(mddev);
                        }
-                       md_update_sb(mddev);
+                       md_update_sb(mddev, 1);
 
                        /* if array is no-longer degraded, then any saved_raid_disk
                         * information must be scrapped
@@ -4948,6 +5415,8 @@ void md_check_recovery(mddev_t *mddev)
                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+               if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+                       goto unlock;
                /* no recovery is running.
                 * remove any failed drives, then
                 * add spares if possible.
@@ -4970,6 +5439,7 @@ void md_check_recovery(mddev_t *mddev)
                        ITERATE_RDEV(mddev,rdev,rtmp)
                                if (rdev->raid_disk < 0
                                    && !test_bit(Faulty, &rdev->flags)) {
+                                       rdev->recovery_offset = 0;
                                        if (mddev->pers->hot_add_disk(mddev,rdev)) {
                                                char nm[20];
                                                sprintf(nm, "rd%d", rdev->raid_disk);
@@ -5028,8 +5498,10 @@ static int md_notify_reboot(struct notifier_block *this,
                printk(KERN_INFO "md: stopping all md devices.\n");
 
                ITERATE_MDDEV(mddev,tmp)
-                       if (mddev_trylock(mddev)==0)
+                       if (mddev_trylock(mddev)) {
                                do_md_stop (mddev, 1);
+                               mddev_unlock(mddev);
+                       }
                /*
                 * certain more exotic SCSI devices are known to be
                 * volatile wrt too early system reboots. While the
@@ -5060,37 +5532,16 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-       int minor;
-
-       printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
-                       " MD_SB_DISKS=%d\n",
-                       MD_MAJOR_VERSION, MD_MINOR_VERSION,
-                       MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
-       printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
-                       BITMAP_MINOR);
-
        if (register_blkdev(MAJOR_NR, "md"))
                return -1;
        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
                unregister_blkdev(MAJOR_NR, "md");
                return -1;
        }
-       devfs_mk_dir("md");
-       blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
-                               md_probe, NULL, NULL);
-       blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+       blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+                           md_probe, NULL, NULL);
+       blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
-
-       for (minor=0; minor < MAX_MD_DEVS; ++minor)
-               devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
-                               S_IFBLK|S_IRUSR|S_IWUSR,
-                               "md/%d", minor);
-
-       for (minor=0; minor < MAX_MD_DEVS; ++minor)
-               devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
-                             S_IFBLK|S_IRUSR|S_IWUSR,
-                             "md/mdp%d", minor);
-
 
        register_reboot_notifier(&md_notifier);
        raid_table_header = register_sysctl_table(raid_root_table, 1);
@@ -5147,15 +5598,9 @@ static __exit void md_exit(void)
 {
        mddev_t *mddev;
        struct list_head *tmp;
-       int i;
-       blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
-       blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
-       for (i=0; i < MAX_MD_DEVS; i++)
-               devfs_remove("md/%d", i);
-       for (i=0; i < MAX_MD_DEVS; i++)
-               devfs_remove("md/d%d", i);
 
-       devfs_remove("md");
+       blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+       blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
        unregister_blkdev(MAJOR_NR,"md");
        unregister_blkdev(mdp_major, "mdp");
@@ -5192,8 +5637,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
        return -EINVAL;
 }
 
-module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
-module_param(start_dirty_degraded, int, 0644);
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
 
 
 EXPORT_SYMBOL(register_md_personality);
@@ -5205,7 +5650,6 @@ EXPORT_SYMBOL(md_write_end);
 EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
-EXPORT_SYMBOL(md_print_devices);
 EXPORT_SYMBOL(md_check_recovery);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md");