*
* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
*
- * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
+ * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
*
* Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- if (r_queue->unplug_fn)
- r_queue->unplug_fn(r_queue);
+ blk_unplug(r_queue);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
}
+static int flush_pending_writes(conf_t *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ * We return 1 if any requests were actually submitted.
+ */
+ int rv = 0;
+
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(conf->mddev->queue);
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ rv = 1;
+ } else
+ spin_unlock_irq(&conf->device_lock);
+ return rv;
+}
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
/* stop syncio and normal IO and wait for everything to
* go quite.
* We increment barrier and nr_waiting, and then
- * wait until barrier+nr_pending match nr_queued+2
+ * wait until nr_pending match nr_queued+1
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (1)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
*/
spin_lock_irq(&conf->resync_lock);
conf->barrier++;
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier,
- conf->barrier+conf->nr_pending == conf->nr_queued+2,
+ conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- raid1_unplug(conf->mddev->queue));
+ ({ flush_pending_writes(conf);
+ raid1_unplug(conf->mddev->queue); }));
spin_unlock_irq(&conf->resync_lock);
}
static void unfreeze_array(conf_t *conf)
r1bio_t *r1_bio;
struct bio *read_bio;
int i, targets = 0, disks;
- mdk_rdev_t *rdev;
- struct bitmap *bitmap = mddev->bitmap;
+ struct bitmap *bitmap;
unsigned long flags;
struct bio_list bl;
struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
const int do_sync = bio_sync(bio);
int do_barriers;
+ mdk_rdev_t *blocked_rdev;
/*
* Register the new request and wait if the reconstruction
wait_barrier(conf);
+ bitmap = mddev->bitmap;
+
disk_stat_inc(mddev->gendisk, ios[rw]);
disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
first = 0;
}
#endif
+ retry_write:
+ blocked_rdev = NULL;
rcu_read_lock();
for (i = 0; i < disks; i++) {
- if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
- !test_bit(Faulty, &rdev->flags)) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
+ if (unlikely(blocked_rdev)) {
+ /* Wait for this device to become unblocked */
+ int j;
+
+ for (j = 0; j < i; j++)
+ if (r1_bio->bios[j])
+ rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+
+ allow_barrier(conf);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_write;
+ }
+
BUG_ON(targets == 0); /* we never fail the last device */
if (targets < conf->raid_disks) {
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* In case raid1d snuck into freeze_array */
+ wake_up(&conf->wait_barrier);
+
if (do_sync)
md_wakeup_thread(mddev->thread);
#if 0
/*
* if recovery is running, make sure it aborts.
*/
- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
- " Operation continuing on %d devices\n",
+ printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
+ "raid1: Operation continuing on %d devices.\n",
bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
- int found = 0;
+ int err = -EEXIST;
int mirror = 0;
mirror_info_t *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
- for (mirror=0; mirror < mddev->raid_disks; mirror++)
+ for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
p->head_position = 0;
rdev->raid_disk = mirror;
- found = 1;
+ err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
}
print_conf(conf);
- return found;
+ return err;
}
static int raid1_remove_disk(mddev_t *mddev, int number)
err = -EBUSY;
goto abort;
}
+ /* Only remove non-faulty devices is recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ mddev->degraded < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
p->rdev = NULL;
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
j = 0;
if (j >= 0)
mddev->resync_mismatches += r1_bio->sectors;
- if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
} else {
/* fixup the bio for reuse */
+ int size;
sbio->bi_vcnt = vcnt;
sbio->bi_size = r1_bio->sectors << 9;
sbio->bi_idx = 0;
sbio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
- for (j = 0; j < vcnt ; j++)
- memcpy(page_address(sbio->bi_io_vec[j].bv_page),
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
page_address(pbio->bi_io_vec[j].bv_page),
PAGE_SIZE);
+ }
}
}
for (;;) {
char b[BDEVNAME_SIZE];
- spin_lock_irqsave(&conf->device_lock, flags);
-
- if (conf->pending_bio_list.head) {
- bio = bio_list_get(&conf->pending_bio_list);
- blk_remove_plug(mddev->queue);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /* flush any pending bitmap writes to disk before proceeding w/ I/O */
- bitmap_unplug(mddev->bitmap);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- bio->bi_next = NULL;
- generic_make_request(bio);
- bio = next;
- }
- unplug = 1;
- continue;
- }
+ unplug += flush_pending_writes(conf);
- if (list_empty(head))
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
break;
+ }
r1_bio = list_entry(head->prev, r1bio_t, retry_list);
list_del(head->prev);
conf->nr_queued--;
}
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
if (unplug)
unplug_slaves(mddev);
}
if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
raise_barrier(conf);
conf->next_resync = sector_nr;
return rv;
}
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
nr_sectors = 0;
sync_blocks = 0;
do {
if (!conf->r1bio_pool)
goto out_no_mem;
- ITERATE_RDEV(mddev, rdev, tmp) {
+ spin_lock_init(&conf->device_lock);
+ mddev->queue->queue_lock = &conf->device_lock;
+
+ rdev_for_each(rdev, tmp, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
- spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
spin_lock_init(&conf->resync_lock);
/*
* Ok, everything is just fine now
*/
- mddev->array_size = mddev->size;
+ mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- mddev->array_size = sectors>>1;
- set_capacity(mddev->gendisk, mddev->array_size << 1);
+ mddev->array_sectors = sectors;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
- if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+ if (mddev->array_sectors / 2 > mddev->size &&
+ mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
- mddev->size = mddev->array_size;
+ mddev->size = mddev->array_sectors / 2;
mddev->resync_max_sectors = sectors;
return 0;
}
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
unsigned long flags;
- int d, d2;
+ int d, d2, err;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk ||
return -EINVAL;
}
- md_allow_write(mddev);
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
raid_disks = mddev->raid_disks + mddev->delta_disks;