md: fix possible raid1/raid10 deadlock on read error during resync

author NeilBrown <neilb@suse.de>

Tue, 4 Mar 2008 22:29:35 +0000 (14:29 -0800)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Wed, 5 Mar 2008 00:35:18 +0000 (16:35 -0800)
author NeilBrown <neilb@suse.de>
Tue, 4 Mar 2008 22:29:35 +0000 (14:29 -0800)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 5 Mar 2008 00:35:18 +0000 (16:35 -0800)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c

index 38f076a3400d6202aba4b7e7f460ff81a3d8c426..ff61b309129aa8ffa9dbd00987a71c1b6eb35bf5 100644 (file)
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -704,13 +704,20 @@ static void freeze_array(conf_t *conf)
         /* stop syncio and normal IO and wait for everything to
          * go quiet.
          * We increment barrier and nr_waiting, and then
-        * wait until barrier+nr_pending match nr_queued+2
+        * wait until nr_pending matches nr_queued+1
+        * This is called in the context of one normal IO request
+        * that has failed. Thus any sync request that might be pending
+        * will be blocked by nr_pending, and we need to wait for
+        * pending IO requests to complete or be queued for re-try.
+        * Thus the number queued (nr_queued) plus this request (1)
+        * must match the number of pending IOs (nr_pending) before
+        * we continue.
          */
         spin_lock_irq(&conf->resync_lock);
         conf->barrier++;
         conf->nr_waiting++;
         wait_event_lock_irq(conf->wait_barrier,
-                           conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                           conf->nr_pending == conf->nr_queued+1,
                             conf->resync_lock,
                             ({ flush_pending_writes(conf);
                                raid1_unplug(conf->mddev->queue); }));
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c

index 6c486d839c99d1f875ce9f99f54b0dcfaf6188aa..8e5671d2f3d3cdd1225137f33616d065d1ea80e4 100644 (file)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -747,13 +747,20 @@ static void freeze_array(conf_t *conf)
         /* stop syncio and normal IO and wait for everything to
          * go quiet.
          * We increment barrier and nr_waiting, and then
-        * wait until barrier+nr_pending match nr_queued+2
+        * wait until nr_pending matches nr_queued+1
+        * This is called in the context of one normal IO request
+        * that has failed. Thus any sync request that might be pending
+        * will be blocked by nr_pending, and we need to wait for
+        * pending IO requests to complete or be queued for re-try.
+        * Thus the number queued (nr_queued) plus this request (1)
+        * must match the number of pending IOs (nr_pending) before
+        * we continue.
          */
         spin_lock_irq(&conf->resync_lock);
         conf->barrier++;
         conf->nr_waiting++;
         wait_event_lock_irq(conf->wait_barrier,
-                           conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                           conf->nr_pending == conf->nr_queued+1,
                             conf->resync_lock,
                             ({ flush_pending_writes(conf);
                                raid10_unplug(conf->mddev->queue); }));
author	NeilBrown <neilb@suse.de>
	Tue, 4 Mar 2008 22:29:35 +0000 (14:29 -0800)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Wed, 5 Mar 2008 00:35:18 +0000 (16:35 -0800)
drivers/md/raid1.c		patch | blob | history
drivers/md/raid10.c		patch | blob | history