]> nv-tegra.nvidia Code Review - linux-2.6.git/blobdiff - drivers/md/md.c
Merge branch 'for-linus' of git://neil.brown.name/md
[linux-2.6.git] / drivers / md / md.c
index 117ea5fde56824f86839339cd968c380509ff6cf..ed5727c089a9403b2ad10bb7e581beae5b0d36ac 100644 (file)
@@ -201,24 +201,75 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
                )
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-       bio_io_error(bio);
-       return 0;
+       mddev_t *mddev = q->queuedata;
+       int rv;
+       if (mddev == NULL || mddev->pers == NULL) {
+               bio_io_error(bio);
+               return 0;
+       }
+       rcu_read_lock();
+       if (mddev->suspended) {
+               DEFINE_WAIT(__wait);
+               for (;;) {
+                       prepare_to_wait(&mddev->sb_wait, &__wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       if (!mddev->suspended)
+                               break;
+                       rcu_read_unlock();
+                       schedule();
+                       rcu_read_lock();
+               }
+               finish_wait(&mddev->sb_wait, &__wait);
+       }
+       atomic_inc(&mddev->active_io);
+       rcu_read_unlock();
+       rv = mddev->pers->make_request(q, bio);
+       if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+               wake_up(&mddev->sb_wait);
+
+       return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+       BUG_ON(mddev->suspended);
+       mddev->suspended = 1;
+       synchronize_rcu();
+       wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+       mddev->pers->quiesce(mddev, 1);
+       md_unregister_thread(mddev->thread);
+       mddev->thread = NULL;
+       /* we now know that no code is executing in the personality module,
+        * except possibly the tail end of a ->bi_end_io function, but that
+        * is certain to complete before the module has a chance to get
+        * unloaded
+        */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+       mddev->suspended = 0;
+       wake_up(&mddev->sb_wait);
+       mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
        atomic_inc(&mddev->active);
        return mddev;
 }
 
-static void mddev_delayed_delete(struct work_struct *ws)
-{
-       mddev_t *mddev = container_of(ws, mddev_t, del_work);
-       kobject_del(&mddev->kobj);
-       kobject_put(&mddev->kobj);
-}
+static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
@@ -314,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        atomic_set(&new->openers, 0);
+       atomic_set(&new->active_io, 0);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        init_waitqueue_head(&new->recovery_wait);
@@ -330,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+       return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
        return mutex_trylock(&mddev->reconfig_mutex);
@@ -2225,16 +2282,34 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
        return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+       unsigned long long blocks;
+       sector_t new;
+
+       if (strict_strtoull(buf, 10, &blocks) < 0)
+               return -EINVAL;
+
+       if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+               return -EINVAL; /* sector conversion overflow */
+
+       new = blocks * 2;
+       if (new != blocks * 2)
+               return -EINVAL; /* unsigned long long to sector_t overflow */
+
+       *sectors = new;
+       return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
        mddev_t *my_mddev = rdev->mddev;
        sector_t oldsectors = rdev->sectors;
-       unsigned long long sectors;
+       sector_t sectors;
 
-       if (strict_strtoull(buf, 10, &sectors) < 0)
+       if (strict_blocks_to_sectors(buf, &sectors) < 0)
                return -EINVAL;
-       sectors *= 2;
        if (my_mddev->pers && rdev->raid_disk >= 0) {
                if (my_mddev->persistent) {
                        sectors = super_types[my_mddev->major_version].
@@ -2592,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+       char level[16];
        ssize_t rv = len;
-       if (mddev->pers)
+       struct mdk_personality *pers;
+       void *priv;
+
+       if (mddev->pers == NULL) {
+               if (len == 0)
+                       return 0;
+               if (len >= sizeof(mddev->clevel))
+                       return -ENOSPC;
+               strncpy(mddev->clevel, buf, len);
+               if (mddev->clevel[len-1] == '\n')
+                       len--;
+               mddev->clevel[len] = 0;
+               mddev->level = LEVEL_NONE;
+               return rv;
+       }
+
+       /* request to change the personality.  Need to ensure:
+        *  - array is not engaged in resync/recovery/reshape
+        *  - old personality can be suspended
+        *  - new personality will access other array.
+        */
+
+       if (mddev->sync_thread || mddev->reshape_position != MaxSector)
                return -EBUSY;
-       if (len == 0)
-               return 0;
-       if (len >= sizeof(mddev->clevel))
-               return -ENOSPC;
-       strncpy(mddev->clevel, buf, len);
-       if (mddev->clevel[len-1] == '\n')
+
+       if (!mddev->pers->quiesce) {
+               printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+                      mdname(mddev), mddev->pers->name);
+               return -EINVAL;
+       }
+
+       /* Now find the new personality */
+       if (len == 0 || len >= sizeof(level))
+               return -EINVAL;
+       strncpy(level, buf, len);
+       if (level[len-1] == '\n')
                len--;
-       mddev->clevel[len] = 0;
-       mddev->level = LEVEL_NONE;
+       level[len] = 0;
+
+       request_module("md-%s", level);
+       spin_lock(&pers_lock);
+       pers = find_pers(LEVEL_NONE, level);
+       if (!pers || !try_module_get(pers->owner)) {
+               spin_unlock(&pers_lock);
+               printk(KERN_WARNING "md: personality %s not loaded\n", level);
+               return -EINVAL;
+       }
+       spin_unlock(&pers_lock);
+
+       if (pers == mddev->pers) {
+               /* Nothing to do! */
+               module_put(pers->owner);
+               return rv;
+       }
+       if (!pers->takeover) {
+               module_put(pers->owner);
+               printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+                      mdname(mddev), level);
+               return -EINVAL;
+       }
+
+       /* ->takeover must set new_* and/or delta_disks
+        * if it succeeds, and may set them when it fails.
+        */
+       priv = pers->takeover(mddev);
+       if (IS_ERR(priv)) {
+               mddev->new_level = mddev->level;
+               mddev->new_layout = mddev->layout;
+               mddev->new_chunk = mddev->chunk_size;
+               mddev->raid_disks -= mddev->delta_disks;
+               mddev->delta_disks = 0;
+               module_put(pers->owner);
+               printk(KERN_WARNING "md: %s: %s would not accept array\n",
+                      mdname(mddev), level);
+               return PTR_ERR(priv);
+       }
+
+       /* Looks like we have a winner */
+       mddev_suspend(mddev);
+       mddev->pers->stop(mddev);
+       module_put(mddev->pers->owner);
+       mddev->pers = pers;
+       mddev->private = priv;
+       strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+       mddev->level = mddev->new_level;
+       mddev->layout = mddev->new_layout;
+       mddev->chunk_size = mddev->new_chunk;
+       mddev->delta_disks = 0;
+       pers->run(mddev);
+       mddev_resume(mddev);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       md_wakeup_thread(mddev->thread);
        return rv;
 }
 
@@ -2631,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
        if (!*buf || (*e && *e != '\n'))
                return -EINVAL;
 
-       if (mddev->pers)
-               return -EBUSY;
-
-       mddev->new_layout = n;
-       if (mddev->reshape_position == MaxSector)
-               mddev->layout = n;
+       if (mddev->pers) {
+               int err;
+               if (mddev->pers->reconfig == NULL)
+                       return -EBUSY;
+               err = mddev->pers->reconfig(mddev, n, -1);
+               if (err)
+                       return err;
+       } else {
+               mddev->new_layout = n;
+               if (mddev->reshape_position == MaxSector)
+                       mddev->layout = n;
+       }
        return len;
 }
 static struct md_sysfs_entry md_layout =
@@ -2693,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-       /* can only set chunk_size if array is not yet active */
        char *e;
        unsigned long n = simple_strtoul(buf, &e, 10);
 
        if (!*buf || (*e && *e != '\n'))
                return -EINVAL;
 
-       if (mddev->pers)
-               return -EBUSY;
-
-       mddev->new_chunk = n;
-       if (mddev->reshape_position == MaxSector)
-               mddev->chunk_size = n;
+       if (mddev->pers) {
+               int err;
+               if (mddev->pers->reconfig == NULL)
+                       return -EBUSY;
+               err = mddev->pers->reconfig(mddev, -1, n);
+               if (err)
+                       return err;
+       } else {
+               mddev->new_chunk = n;
+               if (mddev->reshape_position == MaxSector)
+                       mddev->chunk_size = n;
+       }
        return len;
 }
 static struct md_sysfs_entry md_chunk_size =
@@ -2714,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+       if (mddev->recovery_cp == MaxSector)
+               return sprintf(page, "none\n");
        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
 
@@ -3031,12 +3202,11 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
         * not increase it (except from 0).
         * If array is active, we can try an on-line resize
         */
-       unsigned long long sectors;
-       int err = strict_strtoull(buf, 10, &sectors);
+       sector_t sectors;
+       int err = strict_blocks_to_sectors(buf, &sectors);
 
        if (err < 0)
                return err;
-       sectors *= 2;
        if (mddev->pers) {
                err = update_size(mddev, sectors);
                md_update_sb(mddev, 1);
@@ -3296,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
        unsigned long resync, dt, db;
+       if (mddev->curr_resync == 0)
+               return sprintf(page, "none\n");
        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
        dt = (jiffies - mddev->resync_mark) / HZ;
        if (!dt) dt++;
@@ -3476,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+       if (mddev->external_size)
+               return sprintf(page, "%llu\n",
+                              (unsigned long long)mddev->array_sectors/2);
+       else
+               return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       sector_t sectors;
+
+       if (strncmp(buf, "default", 7) == 0) {
+               if (mddev->pers)
+                       sectors = mddev->pers->size(mddev, 0, 0);
+               else
+                       sectors = mddev->array_sectors;
+
+               mddev->external_size = 0;
+       } else {
+               if (strict_blocks_to_sectors(buf, &sectors) < 0)
+                       return -EINVAL;
+               if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+                       return -EINVAL;
+
+               mddev->external_size = 1;
+       }
+
+       mddev->array_sectors = sectors;
+       set_capacity(mddev->gendisk, mddev->array_sectors);
+       if (mddev->pers) {
+               struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+               if (bdev) {
+                       mutex_lock(&bdev->bd_inode->i_mutex);
+                       i_size_write(bdev->bd_inode,
+                                    (loff_t)mddev->array_sectors << 9);
+                       mutex_unlock(&bdev->bd_inode->i_mutex);
+                       bdput(bdev);
+               }
+       }
+
+       return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
@@ -3489,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
        &md_safe_delay.attr,
        &md_array_state.attr,
        &md_reshape_position.attr,
+       &md_array_size.attr,
        NULL,
 };
 
@@ -3582,6 +3806,21 @@ static struct kobj_type md_ktype = {
 
 int mdp_major = 0;
 
+static void mddev_delayed_delete(struct work_struct *ws)
+{
+       mddev_t *mddev = container_of(ws, mddev_t, del_work);
+
+       if (mddev->private == &md_redundancy_group) {
+               sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+               if (mddev->sysfs_action)
+                       sysfs_put(mddev->sysfs_action);
+               mddev->sysfs_action = NULL;
+               mddev->private = NULL;
+       }
+       kobject_del(&mddev->kobj);
+       kobject_put(&mddev->kobj);
+}
+
 static int md_alloc(dev_t dev, char *name)
 {
        static DEFINE_MUTEX(disks_mutex);
@@ -3632,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
                mddev_put(mddev);
                return -ENOMEM;
        }
+       mddev->queue->queuedata = mddev;
+
        /* Can be unlocked because the queue is new: no concurrency */
        queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-       blk_queue_make_request(mddev->queue, md_fail_request);
+       blk_queue_make_request(mddev->queue, md_make_request);
 
        disk = alloc_disk(1 << shift);
        if (!disk) {
@@ -3892,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
        err = mddev->pers->run(mddev);
        if (err)
                printk(KERN_ERR "md: pers->run() failed ...\n");
-       else if (mddev->pers->sync_request) {
+       else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+               WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+                         " but 'external_size' not in effect?\n", __func__);
+               printk(KERN_ERR
+                      "md: invalid array_size %llu > default size %llu\n",
+                      (unsigned long long)mddev->array_sectors / 2,
+                      (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+               err = -EINVAL;
+               mddev->pers->stop(mddev);
+       }
+       if (err == 0 && mddev->pers->sync_request) {
                err = bitmap_create(mddev);
                if (err) {
                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3938,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
 
        set_capacity(disk, mddev->array_sectors);
 
-       /* If we call blk_queue_make_request here, it will
-        * re-initialise max_sectors etc which may have been
-        * refined inside -> run.  So just set the bits we need to set.
-        * Most initialisation happended when we called
-        * blk_queue_make_request(..., md_fail_request)
-        * earlier.
-        */
-       mddev->queue->queuedata = mddev;
-       mddev->queue->make_request_fn = mddev->pers->make_request;
-
        /* If there is a partially-recovered drive we need to
         * start recovery here.  If we leave it to md_check_recovery,
         * it will remove the drives and not do the right thing
@@ -4077,18 +4318,14 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                        md_super_wait(mddev);
                        if (mddev->ro)
                                set_disk_ro(disk, 0);
-                       blk_queue_make_request(mddev->queue, md_fail_request);
+
                        mddev->pers->stop(mddev);
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
-                       if (mddev->pers->sync_request) {
-                               sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-                               if (mddev->sysfs_action)
-                                       sysfs_put(mddev->sysfs_action);
-                               mddev->sysfs_action = NULL;
-                       }
                        module_put(mddev->pers->owner);
+                       if (mddev->pers->sync_request)
+                               mddev->private = &md_redundancy_group;
                        mddev->pers = NULL;
                        /* tell userspace to handle 'inactive' */
                        sysfs_notify_dirent(mddev->sysfs_state);
@@ -4138,6 +4375,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                export_array(mddev);
 
                mddev->array_sectors = 0;
+               mddev->external_size = 0;
                mddev->dev_sectors = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
@@ -4834,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        return 0;
 }
 
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+       WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+       if (mddev->external_size)
+               return;
+
+       mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
        mdk_rdev_t *rdev;
@@ -5382,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
+       if (!thread)
+               return;
        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
        kthread_stop(thread->tsk);
@@ -6392,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
                                        sysfs_notify(&mddev->kobj, NULL,
                                                     "degraded");
                        }
+                       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+                           mddev->pers->finish_reshape)
+                               mddev->pers->finish_reshape(mddev);
                        md_update_sb(mddev, 1);
 
                        /* if array is no-longer degraded, then any saved_raid_disk