block: make bi_phys_segments an unsigned int instead of short
[linux-2.6.git] / drivers / md / raid5.c
index 8a6f101..37e5465 100644 (file)
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 #endif
 
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_phys_segments(struct bio *bio)
+{
+       return bio->bi_phys_segments & 0xffff;
+}
+
+static inline int raid5_bi_hw_segments(struct bio *bio)
+{
+       return (bio->bi_phys_segments >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+{
+       --bio->bi_phys_segments;
+       return raid5_bi_phys_segments(bio);
+}
+
+static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+{
+       unsigned short val = raid5_bi_hw_segments(bio);
+
+       --val;
+       bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
+       return val;
+}
+
+static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+{
+       bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+}
+
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
        disk++;
@@ -507,7 +541,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        while (rbi && rbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
-                               if (--rbi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(rbi)) {
                                        rbi->bi_next = return_bi;
                                        return_bi = rbi;
                                }
@@ -1725,7 +1759,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        if (*bip)
                bi->bi_next = *bip;
        *bip = bi;
-       bi->bi_phys_segments ++;
+       bi->bi_phys_segments++;
        spin_unlock_irq(&conf->device_lock);
        spin_unlock(&sh->lock);
 
@@ -1819,7 +1853,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                        sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (--bi->bi_phys_segments == 0) {
+                       if (!raid5_dec_bi_phys_segments(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
@@ -1834,7 +1868,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                       sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (--bi->bi_phys_segments == 0) {
+                       if (!raid5_dec_bi_phys_segments(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
@@ -1858,7 +1892,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                                struct bio *nextbi =
                                        r5_next_bio(bi, sh->dev[i].sector);
                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                               if (--bi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(bi)) {
                                        bi->bi_next = *return_bi;
                                        *return_bi = bi;
                                }
@@ -2033,7 +2067,7 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
                                while (wbi && wbi->bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
-                                       if (--wbi->bi_phys_segments == 0) {
+                                       if (!raid5_dec_bi_phys_segments(wbi)) {
                                                md_write_end(conf->mddev);
                                                wbi->bi_next = *return_bi;
                                                *return_bi = wbi;
@@ -2507,7 +2541,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
  *
  */
 
-static void handle_stripe5(struct stripe_head *sh)
+static bool handle_stripe5(struct stripe_head *sh)
 {
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks, i;
@@ -2568,10 +2602,10 @@ static void handle_stripe5(struct stripe_head *sh)
                if (dev->written)
                        s.written++;
                rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+               if (blocked_rdev == NULL &&
+                   rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
-                       break;
                }
                if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                        /* The ReadError flag will just be confusing now */
@@ -2588,8 +2622,14 @@ static void handle_stripe5(struct stripe_head *sh)
        rcu_read_unlock();
 
        if (unlikely(blocked_rdev)) {
-               set_bit(STRIPE_HANDLE, &sh->state);
-               goto unlock;
+               if (s.syncing || s.expanding || s.expanded ||
+                   s.to_write || s.written) {
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       goto unlock;
+               }
+               /* There is nothing for the blocked_rdev to block */
+               rdev_dec_pending(blocked_rdev, conf->mddev);
+               blocked_rdev = NULL;
        }
 
        if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
@@ -2717,10 +2757,11 @@ static void handle_stripe5(struct stripe_head *sh)
        if (sh->reconstruct_state == reconstruct_state_result) {
                sh->reconstruct_state = reconstruct_state_idle;
                clear_bit(STRIPE_EXPANDING, &sh->state);
-               for (i = conf->raid_disks; i--; )
+               for (i = conf->raid_disks; i--; ) {
                        set_bit(R5_Wantwrite, &sh->dev[i].flags);
-                       set_bit(R5_LOCKED, &dev->flags);
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
                        s.locked++;
+               }
        }
 
        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
@@ -2754,9 +2795,11 @@ static void handle_stripe5(struct stripe_head *sh)
        ops_run_io(sh, &s);
 
        return_io(return_bi);
+
+       return blocked_rdev == NULL;
 }
 
-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
        raid6_conf_t *conf = sh->raid_conf;
        int disks = sh->disks;
@@ -2805,7 +2848,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                copy_data(0, rbi, dev->page, dev->sector);
                                rbi2 = r5_next_bio(rbi, dev->sector);
                                spin_lock_irq(&conf->device_lock);
-                               if (--rbi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(rbi)) {
                                        rbi->bi_next = return_bi;
                                        return_bi = rbi;
                                }
@@ -2829,10 +2872,10 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                if (dev->written)
                        s.written++;
                rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+               if (blocked_rdev == NULL &&
+                   rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
-                       break;
                }
                if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                        /* The ReadError flag will just be confusing now */
@@ -2850,9 +2893,16 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
        rcu_read_unlock();
 
        if (unlikely(blocked_rdev)) {
-               set_bit(STRIPE_HANDLE, &sh->state);
-               goto unlock;
+               if (s.syncing || s.expanding || s.expanded ||
+                   s.to_write || s.written) {
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       goto unlock;
+               }
+               /* There is nothing for the blocked_rdev to block */
+               rdev_dec_pending(blocked_rdev, conf->mddev);
+               blocked_rdev = NULL;
        }
+
        pr_debug("locked=%d uptodate=%d to_read=%d"
               " to_write=%d failed=%d failed_num=%d,%d\n",
               s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -2967,14 +3017,17 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
        ops_run_io(sh, &s);
 
        return_io(return_bi);
+
+       return blocked_rdev == NULL;
 }
 
-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+/* returns true if the stripe was handled */
+static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 {
        if (sh->raid_conf->level == 6)
-               handle_stripe6(sh, tmp_page);
+               return handle_stripe6(sh, tmp_page);
        else
-               handle_stripe5(sh);
+               return handle_stripe5(sh);
 }
 
 
@@ -3073,15 +3126,17 @@ static int raid5_congested(void *data, int bits)
 /* We want read requests to align with chunks where possible,
  * but write requests don't need to.
  */
-static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec)
+static int raid5_mergeable_bvec(struct request_queue *q,
+                               struct bvec_merge_data *bvm,
+                               struct bio_vec *biovec)
 {
        mddev_t *mddev = q->queuedata;
-       sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
+       sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
        int max;
        unsigned int chunk_sectors = mddev->chunk_size >> 9;
-       unsigned int bio_sectors = bio->bi_size >> 9;
+       unsigned int bio_sectors = bvm->bi_size >> 9;
 
-       if (bio_data_dir(bio) == WRITE)
+       if ((bvm->bi_rw & 1) == WRITE)
                return biovec->bv_len; /* always allow writes to be mergeable */
 
        max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -3134,8 +3189,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
        if(bi) {
                conf->retry_read_aligned_list = bi->bi_next;
                bi->bi_next = NULL;
+               /*
+                * this sets the active strip count to 1 and the processed
+                * strip count to zero (upper 8 bits)
+                */
                bi->bi_phys_segments = 1; /* biased count of active stripes */
-               bi->bi_hw_segments = 0; /* count of processed stripes */
        }
 
        return bi;
@@ -3185,8 +3243,7 @@ static int bio_fits_rdev(struct bio *bi)
        if ((bi->bi_size>>9) > q->max_sectors)
                return 0;
        blk_recount_segments(q, bi);
-       if (bi->bi_phys_segments > q->max_phys_segments ||
-           bi->bi_hw_segments > q->max_hw_segments)
+       if (bi->bi_phys_segments > q->max_phys_segments)
                return 0;
 
        if (q->merge_bvec_fn)
@@ -3447,7 +3504,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                        
        }
        spin_lock_irq(&conf->device_lock);
-       remaining = --bi->bi_phys_segments;
+       remaining = raid5_dec_bi_phys_segments(bi);
        spin_unlock_irq(&conf->device_lock);
        if (remaining == 0) {
 
@@ -3690,7 +3747,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
        clear_bit(STRIPE_INSYNC, &sh->state);
        spin_unlock(&sh->lock);
 
-       handle_stripe(sh, NULL);
+       /* wait for any blocked device to be handled */
+       while(unlikely(!handle_stripe(sh, NULL)))
+               ;
        release_stripe(sh);
 
        return STRIPE_SECTORS;
@@ -3729,7 +3788,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                     sector += STRIPE_SECTORS,
                     scnt++) {
 
-               if (scnt < raid_bio->bi_hw_segments)
+               if (scnt < raid5_bi_hw_segments(raid_bio))
                        /* already done this stripe */
                        continue;
 
@@ -3737,7 +3796,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
-                       raid_bio->bi_hw_segments = scnt;
+                       raid5_set_bi_hw_segments(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
@@ -3745,7 +3804,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
                        release_stripe(sh);
-                       raid_bio->bi_hw_segments = scnt;
+                       raid5_set_bi_hw_segments(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
@@ -3755,7 +3814,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                handled++;
        }
        spin_lock_irq(&conf->device_lock);
-       remaining = --raid_bio->bi_phys_segments;
+       remaining = raid5_dec_bi_phys_segments(raid_bio);
        spin_unlock_irq(&conf->device_lock);
        if (remaining == 0)
                bio_endio(raid_bio, 0);
@@ -4436,6 +4495,9 @@ static int raid5_check_reshape(mddev_t *mddev)
                return -EINVAL; /* Cannot shrink array or change level yet */
        if (mddev->delta_disks == 0)
                return 0; /* nothing to do */
+       if (mddev->bitmap)
+               /* Cannot grow a bitmap yet */
+               return -EBUSY;
 
        /* Can only proceed if there are plenty of stripe_heads.
         * We need a minimum of one full stripe,, and for sensible progress