[PATCH] md: tidyup some issues with raid1 resync and prepare for catching read errors
NeilBrown [Fri, 6 Jan 2006 08:20:21 +0000 (00:20 -0800)]
We are dereferencing ->rdev without an rcu lock!

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

drivers/md/raid1.c

index b3856db..ea1f1eb 100644 (file)
@@ -177,6 +177,13 @@ static inline void free_r1bio(r1bio_t *r1_bio)
 static inline void put_buf(r1bio_t *r1_bio)
 {
        conf_t *conf = mddev_to_conf(r1_bio->mddev);
+       int i;
+
+       for (i=0; i<conf->raid_disks; i++) {
+               struct bio *bio = r1_bio->bios[i];
+               if (bio->bi_end_io)
+                       rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
+       }
 
        mempool_free(r1_bio, conf->r1buf_pool);
 
@@ -1085,7 +1092,6 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
                         conf->mirrors[r1_bio->read_disk].rdev);
        } else
                set_bit(R1BIO_Uptodate, &r1_bio->state);
-       rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
        reschedule_retry(r1_bio);
        return 0;
 }
@@ -1116,7 +1122,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
                md_done_sync(mddev, r1_bio->sectors, uptodate);
                put_buf(r1_bio);
        }
-       rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
        return 0;
 }
 
@@ -1153,10 +1158,14 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
        atomic_set(&r1_bio->remaining, 1);
        for (i = 0; i < disks ; i++) {
                wbio = r1_bio->bios[i];
-               if (wbio->bi_end_io != end_sync_write)
+               if (wbio->bi_end_io == NULL ||
+                   (wbio->bi_end_io == end_sync_read &&
+                    (i == r1_bio->read_disk ||
+                     !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
                        continue;
 
-               atomic_inc(&conf->mirrors[i].rdev->nr_pending);
+               wbio->bi_rw = WRITE;
+               wbio->bi_end_io = end_sync_write;
                atomic_inc(&r1_bio->remaining);
                md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
 
@@ -1388,14 +1397,13 @@ static int init_resync(conf_t *conf)
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
        conf_t *conf = mddev_to_conf(mddev);
-       mirror_info_t *mirror;
        r1bio_t *r1_bio;
        struct bio *bio;
        sector_t max_sector, nr_sectors;
-       int disk;
+       int disk = -1;
        int i;
-       int wonly;
-       int write_targets = 0;
+       int wonly = -1;
+       int write_targets = 0, read_targets = 0;
        int sync_blocks;
        int still_degraded = 0;
 
@@ -1447,44 +1455,24 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
        conf->next_resync = sector_nr;
 
+       r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+       rcu_read_lock();
        /*
-        * If reconstructing, and >1 working disc,
-        * could dedicate one to rebuild and others to
-        * service read requests ..
+        * If we get a correctably read error during resync or recovery,
+        * we might want to read from a different device.  So we
+        * flag all drives that could conceivably be read from for READ,
+        * and any others (which will be non-In_sync devices) for WRITE.
+        * If a read fails, we try reading from something else for which READ
+        * is OK.
         */
-       disk = conf->last_used;
-       /* make sure disk is operational */
-       wonly = disk;
-       while (conf->mirrors[disk].rdev == NULL ||
-              !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
-              test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
-               ) {
-               if (conf->mirrors[disk].rdev  &&
-                   test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
-                       wonly = disk;
-               if (disk <= 0)
-                       disk = conf->raid_disks;
-               disk--;
-               if (disk == conf->last_used) {
-                       disk = wonly;
-                       break;
-               }
-       }
-       conf->last_used = disk;
-       atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-
-
-       mirror = conf->mirrors + disk;
-
-       r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
        r1_bio->mddev = mddev;
        r1_bio->sector = sector_nr;
        r1_bio->state = 0;
        set_bit(R1BIO_IsSync, &r1_bio->state);
-       r1_bio->read_disk = disk;
 
        for (i=0; i < conf->raid_disks; i++) {
+               mdk_rdev_t *rdev;
                bio = r1_bio->bios[i];
 
                /* take from bio_init */
@@ -1499,35 +1487,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                bio->bi_end_io = NULL;
                bio->bi_private = NULL;
 
-               if (i == disk) {
-                       bio->bi_rw = READ;
-                       bio->bi_end_io = end_sync_read;
-               } else if (conf->mirrors[i].rdev == NULL ||
-                          test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
+               rdev = rcu_dereference(conf->mirrors[i].rdev);
+               if (rdev == NULL ||
+                          test_bit(Faulty, &rdev->flags)) {
                        still_degraded = 1;
                        continue;
-               } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
-                          sector_nr + RESYNC_SECTORS > mddev->recovery_cp   ||
-                          test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+               } else if (!test_bit(In_sync, &rdev->flags)) {
                        bio->bi_rw = WRITE;
                        bio->bi_end_io = end_sync_write;
                        write_targets ++;
-               } else
-                       /* no need to read or write here */
-                       continue;
-               bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
-               bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               } else {
+                       /* may need to read from here */
+                       bio->bi_rw = READ;
+                       bio->bi_end_io = end_sync_read;
+                       if (test_bit(WriteMostly, &rdev->flags)) {
+                               if (wonly < 0)
+                                       wonly = i;
+                       } else {
+                               if (disk < 0)
+                                       disk = i;
+                       }
+                       read_targets++;
+               }
+               atomic_inc(&rdev->nr_pending);
+               bio->bi_sector = sector_nr + rdev->data_offset;
+               bio->bi_bdev = rdev->bdev;
                bio->bi_private = r1_bio;
        }
+       rcu_read_unlock();
+       if (disk < 0)
+               disk = wonly;
+       r1_bio->read_disk = disk;
 
-       if (write_targets == 0) {
+       if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
+               /* extra read targets are also write targets */
+               write_targets += read_targets-1;
+
+       if (write_targets == 0 || read_targets == 0) {
                /* There is nowhere to write, so all non-sync
                 * drives must be failed - so we are finished
                 */
                sector_t rv = max_sector - sector_nr;
                *skipped = 1;
                put_buf(r1_bio);
-               rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
                return rv;
        }
 
@@ -1578,10 +1580,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                sync_blocks -= (len>>9);
        } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
-       bio = r1_bio->bios[disk];
+       bio = r1_bio->bios[r1_bio->read_disk];
        r1_bio->sectors = nr_sectors;
 
-       md_sync_acct(mirror->rdev->bdev, nr_sectors);
+       md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, nr_sectors);
 
        generic_make_request(bio);