block: remove support for bio remapping from ->make_request
[linux-3.10.git] / drivers / md / raid10.c
1 /*
2  * raid10.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 2000-2004 Neil Brown
5  *
6  * RAID-10 support for md.
7  *
8  * Based on code in raid1.c.  See raid1.c for further copyright information.
9  *
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20
21 #include <linux/slab.h>
22 #include <linux/delay.h>
23 #include <linux/blkdev.h>
24 #include <linux/seq_file.h>
25 #include <linux/ratelimit.h>
26 #include "md.h"
27 #include "raid10.h"
28 #include "raid0.h"
29 #include "bitmap.h"
30
31 /*
32  * RAID10 provides a combination of RAID0 and RAID1 functionality.
33  * The layout of data is defined by
34  *    chunk_size
35  *    raid_disks
36  *    near_copies (stored in low byte of layout)
37  *    far_copies (stored in second byte of layout)
38  *    far_offset (stored in bit 16 of layout )
39  *
40  * The data to be stored is divided into chunks using chunksize.
41  * Each device is divided into far_copies sections.
42  * In each section, chunks are laid out in a style similar to raid0, but
43  * near_copies copies of each chunk is stored (each on a different drive).
44  * The starting device for each section is offset near_copies from the starting
45  * device of the previous section.
46  * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different
47  * drive.
48  * near_copies and far_copies must be at least one, and their product is at most
49  * raid_disks.
50  *
51  * If far_offset is true, then the far_copies are handled a bit differently.
52  * The copies are still in different stripes, but instead of being very far apart
53  * on disk, they are in adjacent stripes.
54  */
55
/* Number of r10bios guaranteed to be available under extreme VM load. */
#define NR_RAID10_BIOS 256
60
61 static void allow_barrier(conf_t *conf);
62 static void lower_barrier(conf_t *conf);
63
64 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
65 {
66         conf_t *conf = data;
67         int size = offsetof(struct r10bio_s, devs[conf->copies]);
68
69         /* allocate a r10bio with room for raid_disks entries in the bios array */
70         return kzalloc(size, gfp_flags);
71 }
72
/*
 * Pool release callback matching r10bio_pool_alloc().
 * @data (the conf_t) is not needed to free the object.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
77
/* Maximum size, in bytes, of each resync request */
#define RESYNC_BLOCK_SIZE (64 * 1024)
/* Pages needed to hold one resync request, rounded up */
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE - 1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024 * 1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32 * 1024 * 1024 / RESYNC_BLOCK_SIZE)
85
86 /*
87  * When performing a resync, we need to read and compare, so
88  * we need as many pages are there are copies.
89  * When performing a recovery, we need 2 bios, one for read,
90  * one for write (we recover only one drive per r10buf)
91  *
92  */
93 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
94 {
95         conf_t *conf = data;
96         struct page *page;
97         r10bio_t *r10_bio;
98         struct bio *bio;
99         int i, j;
100         int nalloc;
101
102         r10_bio = r10bio_pool_alloc(gfp_flags, conf);
103         if (!r10_bio)
104                 return NULL;
105
106         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
107                 nalloc = conf->copies; /* resync */
108         else
109                 nalloc = 2; /* recovery */
110
111         /*
112          * Allocate bios.
113          */
114         for (j = nalloc ; j-- ; ) {
115                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
116                 if (!bio)
117                         goto out_free_bio;
118                 r10_bio->devs[j].bio = bio;
119         }
120         /*
121          * Allocate RESYNC_PAGES data pages and attach them
122          * where needed.
123          */
124         for (j = 0 ; j < nalloc; j++) {
125                 bio = r10_bio->devs[j].bio;
126                 for (i = 0; i < RESYNC_PAGES; i++) {
127                         if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
128                                                 &conf->mddev->recovery)) {
129                                 /* we can share bv_page's during recovery */
130                                 struct bio *rbio = r10_bio->devs[0].bio;
131                                 page = rbio->bi_io_vec[i].bv_page;
132                                 get_page(page);
133                         } else
134                                 page = alloc_page(gfp_flags);
135                         if (unlikely(!page))
136                                 goto out_free_pages;
137
138                         bio->bi_io_vec[i].bv_page = page;
139                 }
140         }
141
142         return r10_bio;
143
144 out_free_pages:
145         for ( ; i > 0 ; i--)
146                 safe_put_page(bio->bi_io_vec[i-1].bv_page);
147         while (j--)
148                 for (i = 0; i < RESYNC_PAGES ; i++)
149                         safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
150         j = -1;
151 out_free_bio:
152         while ( ++j < nalloc )
153                 bio_put(r10_bio->devs[j].bio);
154         r10bio_pool_free(r10_bio, conf);
155         return NULL;
156 }
157
158 static void r10buf_pool_free(void *__r10_bio, void *data)
159 {
160         int i;
161         conf_t *conf = data;
162         r10bio_t *r10bio = __r10_bio;
163         int j;
164
165         for (j=0; j < conf->copies; j++) {
166                 struct bio *bio = r10bio->devs[j].bio;
167                 if (bio) {
168                         for (i = 0; i < RESYNC_PAGES; i++) {
169                                 safe_put_page(bio->bi_io_vec[i].bv_page);
170                                 bio->bi_io_vec[i].bv_page = NULL;
171                         }
172                         bio_put(bio);
173                 }
174         }
175         r10bio_pool_free(r10bio, conf);
176 }
177
178 static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
179 {
180         int i;
181
182         for (i = 0; i < conf->copies; i++) {
183                 struct bio **bio = & r10_bio->devs[i].bio;
184                 if (!BIO_SPECIAL(*bio))
185                         bio_put(*bio);
186                 *bio = NULL;
187         }
188 }
189
190 static void free_r10bio(r10bio_t *r10_bio)
191 {
192         conf_t *conf = r10_bio->mddev->private;
193
194         put_all_bios(conf, r10_bio);
195         mempool_free(r10_bio, conf->r10bio_pool);
196 }
197
198 static void put_buf(r10bio_t *r10_bio)
199 {
200         conf_t *conf = r10_bio->mddev->private;
201
202         mempool_free(r10_bio, conf->r10buf_pool);
203
204         lower_barrier(conf);
205 }
206
207 static void reschedule_retry(r10bio_t *r10_bio)
208 {
209         unsigned long flags;
210         mddev_t *mddev = r10_bio->mddev;
211         conf_t *conf = mddev->private;
212
213         spin_lock_irqsave(&conf->device_lock, flags);
214         list_add(&r10_bio->retry_list, &conf->retry_list);
215         conf->nr_queued ++;
216         spin_unlock_irqrestore(&conf->device_lock, flags);
217
218         /* wake up frozen array... */
219         wake_up(&conf->wait_barrier);
220
221         md_wakeup_thread(mddev->thread);
222 }
223
224 /*
225  * raid_end_bio_io() is called when we have finished servicing a mirrored
226  * operation and are ready to return a success/failure code to the buffer
227  * cache layer.
228  */
229 static void raid_end_bio_io(r10bio_t *r10_bio)
230 {
231         struct bio *bio = r10_bio->master_bio;
232         int done;
233         conf_t *conf = r10_bio->mddev->private;
234
235         if (bio->bi_phys_segments) {
236                 unsigned long flags;
237                 spin_lock_irqsave(&conf->device_lock, flags);
238                 bio->bi_phys_segments--;
239                 done = (bio->bi_phys_segments == 0);
240                 spin_unlock_irqrestore(&conf->device_lock, flags);
241         } else
242                 done = 1;
243         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245         if (done) {
246                 bio_endio(bio, 0);
247                 /*
248                  * Wake up any possible resync thread that waits for the device
249                  * to go idle.
250                  */
251                 allow_barrier(conf);
252         }
253         free_r10bio(r10_bio);
254 }
255
256 /*
257  * Update disk head position estimator based on IRQ completion info.
258  */
259 static inline void update_head_pos(int slot, r10bio_t *r10_bio)
260 {
261         conf_t *conf = r10_bio->mddev->private;
262
263         conf->mirrors[r10_bio->devs[slot].devnum].head_position =
264                 r10_bio->devs[slot].addr + (r10_bio->sectors);
265 }
266
267 /*
268  * Find the disk number which triggered given bio
269  */
270 static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
271                          struct bio *bio, int *slotp)
272 {
273         int slot;
274
275         for (slot = 0; slot < conf->copies; slot++)
276                 if (r10_bio->devs[slot].bio == bio)
277                         break;
278
279         BUG_ON(slot == conf->copies);
280         update_head_pos(slot, r10_bio);
281
282         if (slotp)
283                 *slotp = slot;
284         return r10_bio->devs[slot].devnum;
285 }
286
287 static void raid10_end_read_request(struct bio *bio, int error)
288 {
289         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
290         r10bio_t *r10_bio = bio->bi_private;
291         int slot, dev;
292         conf_t *conf = r10_bio->mddev->private;
293
294
295         slot = r10_bio->read_slot;
296         dev = r10_bio->devs[slot].devnum;
297         /*
298          * this branch is our 'one mirror IO has finished' event handler:
299          */
300         update_head_pos(slot, r10_bio);
301
302         if (uptodate) {
303                 /*
304                  * Set R10BIO_Uptodate in our master bio, so that
305                  * we will return a good error code to the higher
306                  * levels even if IO on some other mirrored buffer fails.
307                  *
308                  * The 'master' represents the composite IO operation to
309                  * user-side. So if something waits for IO, then it will
310                  * wait for the 'master' bio.
311                  */
312                 set_bit(R10BIO_Uptodate, &r10_bio->state);
313                 raid_end_bio_io(r10_bio);
314                 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
315         } else {
316                 /*
317                  * oops, read error - keep the refcount on the rdev
318                  */
319                 char b[BDEVNAME_SIZE];
320                 printk_ratelimited(KERN_ERR
321                                    "md/raid10:%s: %s: rescheduling sector %llu\n",
322                                    mdname(conf->mddev),
323                                    bdevname(conf->mirrors[dev].rdev->bdev, b),
324                                    (unsigned long long)r10_bio->sector);
325                 set_bit(R10BIO_ReadError, &r10_bio->state);
326                 reschedule_retry(r10_bio);
327         }
328 }
329
330 static void close_write(r10bio_t *r10_bio)
331 {
332         /* clear the bitmap if all writes complete successfully */
333         bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334                         r10_bio->sectors,
335                         !test_bit(R10BIO_Degraded, &r10_bio->state),
336                         0);
337         md_write_end(r10_bio->mddev);
338 }
339
340 static void raid10_end_write_request(struct bio *bio, int error)
341 {
342         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
343         r10bio_t *r10_bio = bio->bi_private;
344         int dev;
345         int dec_rdev = 1;
346         conf_t *conf = r10_bio->mddev->private;
347         int slot;
348
349         dev = find_bio_disk(conf, r10_bio, bio, &slot);
350
351         /*
352          * this branch is our 'one mirror IO has finished' event handler:
353          */
354         if (!uptodate) {
355                 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
356                 set_bit(R10BIO_WriteError, &r10_bio->state);
357                 dec_rdev = 0;
358         } else {
359                 /*
360                  * Set R10BIO_Uptodate in our master bio, so that
361                  * we will return a good error code for to the higher
362                  * levels even if IO on some other mirrored buffer fails.
363                  *
364                  * The 'master' represents the composite IO operation to
365                  * user-side. So if something waits for IO, then it will
366                  * wait for the 'master' bio.
367                  */
368                 sector_t first_bad;
369                 int bad_sectors;
370
371                 set_bit(R10BIO_Uptodate, &r10_bio->state);
372
373                 /* Maybe we can clear some bad blocks. */
374                 if (is_badblock(conf->mirrors[dev].rdev,
375                                 r10_bio->devs[slot].addr,
376                                 r10_bio->sectors,
377                                 &first_bad, &bad_sectors)) {
378                         bio_put(bio);
379                         r10_bio->devs[slot].bio = IO_MADE_GOOD;
380                         dec_rdev = 0;
381                         set_bit(R10BIO_MadeGood, &r10_bio->state);
382                 }
383         }
384
385         /*
386          *
387          * Let's see if all mirrored write operations have finished
388          * already.
389          */
390         if (atomic_dec_and_test(&r10_bio->remaining)) {
391                 if (test_bit(R10BIO_WriteError, &r10_bio->state))
392                         reschedule_retry(r10_bio);
393                 else {
394                         close_write(r10_bio);
395                         if (test_bit(R10BIO_MadeGood, &r10_bio->state))
396                                 reschedule_retry(r10_bio);
397                         else
398                                 raid_end_bio_io(r10_bio);
399                 }
400         }
401         if (dec_rdev)
402                 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
403 }
404
405
406 /*
407  * RAID10 layout manager
408  * As well as the chunksize and raid_disks count, there are two
409  * parameters: near_copies and far_copies.
410  * near_copies * far_copies must be <= raid_disks.
411  * Normally one of these will be 1.
412  * If both are 1, we get raid0.
413  * If near_copies == raid_disks, we get raid1.
414  *
415  * Chunks are laid out in raid0 style with near_copies copies of the
416  * first chunk, followed by near_copies copies of the next chunk and
417  * so on.
418  * If far_copies > 1, then after 1/far_copies of the array has been assigned
419  * as described above, we start again with a device offset of near_copies.
420  * So we effectively have another copy of the whole array further down all
421  * the drives, but with blocks on different drives.
422  * With this layout, any block is never stored twice on the one device.
423  *
424  * raid10_find_phys finds the sector offset of a given virtual sector
425  * on each device that it is on.
426  *
427  * raid10_find_virt does the reverse mapping, from a device and a
428  * sector offset to a virtual address
429  */
430
431 static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
432 {
433         int n,f;
434         sector_t sector;
435         sector_t chunk;
436         sector_t stripe;
437         int dev;
438
439         int slot = 0;
440
441         /* now calculate first sector/dev */
442         chunk = r10bio->sector >> conf->chunk_shift;
443         sector = r10bio->sector & conf->chunk_mask;
444
445         chunk *= conf->near_copies;
446         stripe = chunk;
447         dev = sector_div(stripe, conf->raid_disks);
448         if (conf->far_offset)
449                 stripe *= conf->far_copies;
450
451         sector += stripe << conf->chunk_shift;
452
453         /* and calculate all the others */
454         for (n=0; n < conf->near_copies; n++) {
455                 int d = dev;
456                 sector_t s = sector;
457                 r10bio->devs[slot].addr = sector;
458                 r10bio->devs[slot].devnum = d;
459                 slot++;
460
461                 for (f = 1; f < conf->far_copies; f++) {
462                         d += conf->near_copies;
463                         if (d >= conf->raid_disks)
464                                 d -= conf->raid_disks;
465                         s += conf->stride;
466                         r10bio->devs[slot].devnum = d;
467                         r10bio->devs[slot].addr = s;
468                         slot++;
469                 }
470                 dev++;
471                 if (dev >= conf->raid_disks) {
472                         dev = 0;
473                         sector += (conf->chunk_mask + 1);
474                 }
475         }
476         BUG_ON(slot != conf->copies);
477 }
478
479 static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
480 {
481         sector_t offset, chunk, vchunk;
482
483         offset = sector & conf->chunk_mask;
484         if (conf->far_offset) {
485                 int fc;
486                 chunk = sector >> conf->chunk_shift;
487                 fc = sector_div(chunk, conf->far_copies);
488                 dev -= fc * conf->near_copies;
489                 if (dev < 0)
490                         dev += conf->raid_disks;
491         } else {
492                 while (sector >= conf->stride) {
493                         sector -= conf->stride;
494                         if (dev < conf->near_copies)
495                                 dev += conf->raid_disks - conf->near_copies;
496                         else
497                                 dev -= conf->near_copies;
498                 }
499                 chunk = sector >> conf->chunk_shift;
500         }
501         vchunk = chunk * conf->raid_disks + dev;
502         sector_div(vchunk, conf->near_copies);
503         return (vchunk << conf->chunk_shift) + offset;
504 }
505
506 /**
507  *      raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
508  *      @q: request queue
509  *      @bvm: properties of new bio
510  *      @biovec: the request that could be merged to it.
511  *
512  *      Return amount of bytes we can accept at this offset
513  *      If near_copies == raid_disk, there are no striping issues,
514  *      but in that case, the function isn't called at all.
515  */
516 static int raid10_mergeable_bvec(struct request_queue *q,
517                                  struct bvec_merge_data *bvm,
518                                  struct bio_vec *biovec)
519 {
520         mddev_t *mddev = q->queuedata;
521         sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
522         int max;
523         unsigned int chunk_sectors = mddev->chunk_sectors;
524         unsigned int bio_sectors = bvm->bi_size >> 9;
525
526         max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
527         if (max < 0) max = 0; /* bio_add cannot handle a negative return */
528         if (max <= biovec->bv_len && bio_sectors == 0)
529                 return biovec->bv_len;
530         else
531                 return max;
532 }
533
534 /*
535  * This routine returns the disk from which the requested read should
536  * be done. There is a per-array 'next expected sequential IO' sector
537  * number - if this matches on the next IO then we use the last disk.
538  * There is also a per-disk 'last know head position' sector that is
539  * maintained from IRQ contexts, both the normal and the resync IO
540  * completion handlers update this position correctly. If there is no
541  * perfect sequential match then we pick the disk whose head is closest.
542  *
543  * If there are 2 mirrors in the same 2 devices, performance degrades
544  * because position is mirror, not device based.
545  *
546  * The rdev for the device selected will have nr_pending incremented.
547  */
548
549 /*
550  * FIXME: possibly should rethink readbalancing and do it differently
551  * depending on near_copies / far_copies geometry.
552  */
553 static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
554 {
555         const sector_t this_sector = r10_bio->sector;
556         int disk, slot;
557         int sectors = r10_bio->sectors;
558         int best_good_sectors;
559         sector_t new_distance, best_dist;
560         mdk_rdev_t *rdev;
561         int do_balance;
562         int best_slot;
563
564         raid10_find_phys(conf, r10_bio);
565         rcu_read_lock();
566 retry:
567         sectors = r10_bio->sectors;
568         best_slot = -1;
569         best_dist = MaxSector;
570         best_good_sectors = 0;
571         do_balance = 1;
572         /*
573          * Check if we can balance. We can balance on the whole
574          * device if no resync is going on (recovery is ok), or below
575          * the resync window. We take the first readable disk when
576          * above the resync window.
577          */
578         if (conf->mddev->recovery_cp < MaxSector
579             && (this_sector + sectors >= conf->next_resync))
580                 do_balance = 0;
581
582         for (slot = 0; slot < conf->copies ; slot++) {
583                 sector_t first_bad;
584                 int bad_sectors;
585                 sector_t dev_sector;
586
587                 if (r10_bio->devs[slot].bio == IO_BLOCKED)
588                         continue;
589                 disk = r10_bio->devs[slot].devnum;
590                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
591                 if (rdev == NULL)
592                         continue;
593                 if (!test_bit(In_sync, &rdev->flags))
594                         continue;
595
596                 dev_sector = r10_bio->devs[slot].addr;
597                 if (is_badblock(rdev, dev_sector, sectors,
598                                 &first_bad, &bad_sectors)) {
599                         if (best_dist < MaxSector)
600                                 /* Already have a better slot */
601                                 continue;
602                         if (first_bad <= dev_sector) {
603                                 /* Cannot read here.  If this is the
604                                  * 'primary' device, then we must not read
605                                  * beyond 'bad_sectors' from another device.
606                                  */
607                                 bad_sectors -= (dev_sector - first_bad);
608                                 if (!do_balance && sectors > bad_sectors)
609                                         sectors = bad_sectors;
610                                 if (best_good_sectors > sectors)
611                                         best_good_sectors = sectors;
612                         } else {
613                                 sector_t good_sectors =
614                                         first_bad - dev_sector;
615                                 if (good_sectors > best_good_sectors) {
616                                         best_good_sectors = good_sectors;
617                                         best_slot = slot;
618                                 }
619                                 if (!do_balance)
620                                         /* Must read from here */
621                                         break;
622                         }
623                         continue;
624                 } else
625                         best_good_sectors = sectors;
626
627                 if (!do_balance)
628                         break;
629
630                 /* This optimisation is debatable, and completely destroys
631                  * sequential read speed for 'far copies' arrays.  So only
632                  * keep it for 'near' arrays, and review those later.
633                  */
634                 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
635                         break;
636
637                 /* for far > 1 always use the lowest address */
638                 if (conf->far_copies > 1)
639                         new_distance = r10_bio->devs[slot].addr;
640                 else
641                         new_distance = abs(r10_bio->devs[slot].addr -
642                                            conf->mirrors[disk].head_position);
643                 if (new_distance < best_dist) {
644                         best_dist = new_distance;
645                         best_slot = slot;
646                 }
647         }
648         if (slot == conf->copies)
649                 slot = best_slot;
650
651         if (slot >= 0) {
652                 disk = r10_bio->devs[slot].devnum;
653                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
654                 if (!rdev)
655                         goto retry;
656                 atomic_inc(&rdev->nr_pending);
657                 if (test_bit(Faulty, &rdev->flags)) {
658                         /* Cannot risk returning a device that failed
659                          * before we inc'ed nr_pending
660                          */
661                         rdev_dec_pending(rdev, conf->mddev);
662                         goto retry;
663                 }
664                 r10_bio->read_slot = slot;
665         } else
666                 disk = -1;
667         rcu_read_unlock();
668         *max_sectors = best_good_sectors;
669
670         return disk;
671 }
672
673 static int raid10_congested(void *data, int bits)
674 {
675         mddev_t *mddev = data;
676         conf_t *conf = mddev->private;
677         int i, ret = 0;
678
679         if (mddev_congested(mddev, bits))
680                 return 1;
681         rcu_read_lock();
682         for (i = 0; i < conf->raid_disks && ret == 0; i++) {
683                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
684                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
685                         struct request_queue *q = bdev_get_queue(rdev->bdev);
686
687                         ret |= bdi_congested(&q->backing_dev_info, bits);
688                 }
689         }
690         rcu_read_unlock();
691         return ret;
692 }
693
694 static void flush_pending_writes(conf_t *conf)
695 {
696         /* Any writes that have been queued but are awaiting
697          * bitmap updates get flushed here.
698          */
699         spin_lock_irq(&conf->device_lock);
700
701         if (conf->pending_bio_list.head) {
702                 struct bio *bio;
703                 bio = bio_list_get(&conf->pending_bio_list);
704                 spin_unlock_irq(&conf->device_lock);
705                 /* flush any pending bitmap writes to disk
706                  * before proceeding w/ I/O */
707                 bitmap_unplug(conf->mddev->bitmap);
708
709                 while (bio) { /* submit pending writes */
710                         struct bio *next = bio->bi_next;
711                         bio->bi_next = NULL;
712                         generic_make_request(bio);
713                         bio = next;
714                 }
715         } else
716                 spin_unlock_irq(&conf->device_lock);
717 }
718
719 /* Barriers....
720  * Sometimes we need to suspend IO while we do something else,
721  * either some resync/recovery, or reconfigure the array.
722  * To do this we raise a 'barrier'.
723  * The 'barrier' is a counter that can be raised multiple times
724  * to count how many activities are happening which preclude
725  * normal IO.
726  * We can only raise the barrier if there is no pending IO.
727  * i.e. if nr_pending == 0.
728  * We choose only to raise the barrier if no-one is waiting for the
729  * barrier to go down.  This means that as soon as an IO request
730  * is ready, no other operations which require a barrier will start
731  * until the IO request has had a chance.
732  *
733  * So: regular IO calls 'wait_barrier'.  When that returns there
734  *    is no background IO happening.  It must arrange to call
735  *    allow_barrier when it has finished its IO.
736  * background IO calls must call raise_barrier.  Once that returns
737  *    there is no normal IO happening.  It must arrange to call
738  *    lower_barrier when the particular background IO completes.
739  */
740
741 static void raise_barrier(conf_t *conf, int force)
742 {
743         BUG_ON(force && !conf->barrier);
744         spin_lock_irq(&conf->resync_lock);
745
746         /* Wait until no block IO is waiting (unless 'force') */
747         wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
748                             conf->resync_lock, );
749
750         /* block any new IO from starting */
751         conf->barrier++;
752
753         /* Now wait for all pending IO to complete */
754         wait_event_lock_irq(conf->wait_barrier,
755                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
756                             conf->resync_lock, );
757
758         spin_unlock_irq(&conf->resync_lock);
759 }
760
761 static void lower_barrier(conf_t *conf)
762 {
763         unsigned long flags;
764         spin_lock_irqsave(&conf->resync_lock, flags);
765         conf->barrier--;
766         spin_unlock_irqrestore(&conf->resync_lock, flags);
767         wake_up(&conf->wait_barrier);
768 }
769
770 static void wait_barrier(conf_t *conf)
771 {
772         spin_lock_irq(&conf->resync_lock);
773         if (conf->barrier) {
774                 conf->nr_waiting++;
775                 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
776                                     conf->resync_lock,
777                                     );
778                 conf->nr_waiting--;
779         }
780         conf->nr_pending++;
781         spin_unlock_irq(&conf->resync_lock);
782 }
783
784 static void allow_barrier(conf_t *conf)
785 {
786         unsigned long flags;
787         spin_lock_irqsave(&conf->resync_lock, flags);
788         conf->nr_pending--;
789         spin_unlock_irqrestore(&conf->resync_lock, flags);
790         wake_up(&conf->wait_barrier);
791 }
792
/*
 * Quiesce the array: raise the barrier and sleep until every pending
 * request except the caller's own is accounted for in nr_queued.
 * unfreeze_array() undoes the two counter increments made here.
 */
static void freeze_array(conf_t *conf)
{
        /* stop syncio and normal IO and wait for everything to
         * go quiet.
         * We increment barrier and nr_waiting, and then
         * wait until nr_pending match nr_queued+1
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
         * pending IO requests to complete or be queued for re-try.
         * Thus the number queued (nr_queued) plus this request (1)
         * must match the number of pending IOs (nr_pending) before
         * we continue.
         */
        spin_lock_irq(&conf->resync_lock);
        conf->barrier++;
        conf->nr_waiting++;
        /* flush_pending_writes() is passed as the command the wait macro
         * runs while the condition is not yet true.
         */
        wait_event_lock_irq(conf->wait_barrier,
                            conf->nr_pending == conf->nr_queued+1,
                            conf->resync_lock,
                            flush_pending_writes(conf));

        spin_unlock_irq(&conf->resync_lock);
}
817
/*
 * Undo freeze_array(): decrement the barrier and nr_waiting counters
 * that freeze_array() incremented, and wake all sleepers on
 * conf->wait_barrier.
 */
static void unfreeze_array(conf_t *conf)
{
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
        conf->barrier--;
        conf->nr_waiting--;
        /* here the wake-up is issued while resync_lock is still held */
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
}
827
828 static void make_request(mddev_t *mddev, struct bio * bio)
829 {
830         conf_t *conf = mddev->private;
831         mirror_info_t *mirror;
832         r10bio_t *r10_bio;
833         struct bio *read_bio;
834         int i;
835         int chunk_sects = conf->chunk_mask + 1;
836         const int rw = bio_data_dir(bio);
837         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
838         const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
839         unsigned long flags;
840         mdk_rdev_t *blocked_rdev;
841         int plugged;
842         int sectors_handled;
843         int max_sectors;
844
845         if (unlikely(bio->bi_rw & REQ_FLUSH)) {
846                 md_flush_request(mddev, bio);
847                 return;
848         }
849
850         /* If this request crosses a chunk boundary, we need to
851          * split it.  This will only happen for 1 PAGE (or less) requests.
852          */
853         if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
854                       > chunk_sects &&
855                     conf->near_copies < conf->raid_disks)) {
856                 struct bio_pair *bp;
857                 /* Sanity check -- queue functions should prevent this happening */
858                 if (bio->bi_vcnt != 1 ||
859                     bio->bi_idx != 0)
860                         goto bad_map;
861                 /* This is a one page bio that upper layers
862                  * refuse to split for us, so we need to split it.
863                  */
864                 bp = bio_split(bio,
865                                chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
866
867                 /* Each of these 'make_request' calls will call 'wait_barrier'.
868                  * If the first succeeds but the second blocks due to the resync
869                  * thread raising the barrier, we will deadlock because the
870                  * IO to the underlying device will be queued in generic_make_request
871                  * and will never complete, so will never reduce nr_pending.
872                  * So increment nr_waiting here so no new raise_barriers will
873                  * succeed, and so the second wait_barrier cannot block.
874                  */
875                 spin_lock_irq(&conf->resync_lock);
876                 conf->nr_waiting++;
877                 spin_unlock_irq(&conf->resync_lock);
878
879                 make_request(mddev, &bp->bio1);
880                 make_request(mddev, &bp->bio2);
881
882                 spin_lock_irq(&conf->resync_lock);
883                 conf->nr_waiting--;
884                 wake_up(&conf->wait_barrier);
885                 spin_unlock_irq(&conf->resync_lock);
886
887                 bio_pair_release(bp);
888                 return;
889         bad_map:
890                 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
891                        " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
892                        (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
893
894                 bio_io_error(bio);
895                 return;
896         }
897
898         md_write_start(mddev, bio);
899
900         /*
901          * Register the new request and wait if the reconstruction
902          * thread has put up a bar for new requests.
903          * Continue immediately if no resync is active currently.
904          */
905         wait_barrier(conf);
906
907         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
908
909         r10_bio->master_bio = bio;
910         r10_bio->sectors = bio->bi_size >> 9;
911
912         r10_bio->mddev = mddev;
913         r10_bio->sector = bio->bi_sector;
914         r10_bio->state = 0;
915
916         /* We might need to issue multiple reads to different
917          * devices if there are bad blocks around, so we keep
918          * track of the number of reads in bio->bi_phys_segments.
919          * If this is 0, there is only one r10_bio and no locking
920          * will be needed when the request completes.  If it is
921          * non-zero, then it is the number of not-completed requests.
922          */
923         bio->bi_phys_segments = 0;
924         clear_bit(BIO_SEG_VALID, &bio->bi_flags);
925
926         if (rw == READ) {
927                 /*
928                  * read balancing logic:
929                  */
930                 int disk;
931                 int slot;
932
933 read_again:
934                 disk = read_balance(conf, r10_bio, &max_sectors);
935                 slot = r10_bio->read_slot;
936                 if (disk < 0) {
937                         raid_end_bio_io(r10_bio);
938                         return;
939                 }
940                 mirror = conf->mirrors + disk;
941
942                 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
943                 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
944                             max_sectors);
945
946                 r10_bio->devs[slot].bio = read_bio;
947
948                 read_bio->bi_sector = r10_bio->devs[slot].addr +
949                         mirror->rdev->data_offset;
950                 read_bio->bi_bdev = mirror->rdev->bdev;
951                 read_bio->bi_end_io = raid10_end_read_request;
952                 read_bio->bi_rw = READ | do_sync;
953                 read_bio->bi_private = r10_bio;
954
955                 if (max_sectors < r10_bio->sectors) {
956                         /* Could not read all from this device, so we will
957                          * need another r10_bio.
958                          */
959                         sectors_handled = (r10_bio->sectors + max_sectors
960                                            - bio->bi_sector);
961                         r10_bio->sectors = max_sectors;
962                         spin_lock_irq(&conf->device_lock);
963                         if (bio->bi_phys_segments == 0)
964                                 bio->bi_phys_segments = 2;
965                         else
966                                 bio->bi_phys_segments++;
967                         spin_unlock(&conf->device_lock);
968                         /* Cannot call generic_make_request directly
969                          * as that will be queued in __generic_make_request
970                          * and subsequent mempool_alloc might block
971                          * waiting for it.  so hand bio over to raid10d.
972                          */
973                         reschedule_retry(r10_bio);
974
975                         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
976
977                         r10_bio->master_bio = bio;
978                         r10_bio->sectors = ((bio->bi_size >> 9)
979                                             - sectors_handled);
980                         r10_bio->state = 0;
981                         r10_bio->mddev = mddev;
982                         r10_bio->sector = bio->bi_sector + sectors_handled;
983                         goto read_again;
984                 } else
985                         generic_make_request(read_bio);
986                 return;
987         }
988
989         /*
990          * WRITE:
991          */
992         /* first select target devices under rcu_lock and
993          * inc refcount on their rdev.  Record them by setting
994          * bios[x] to bio
995          * If there are known/acknowledged bad blocks on any device
996          * on which we have seen a write error, we want to avoid
997          * writing to those blocks.  This potentially requires several
998          * writes to write around the bad blocks.  Each set of writes
999          * gets its own r10_bio with a set of bios attached.  The number
1000          * of r10_bios is recored in bio->bi_phys_segments just as with
1001          * the read case.
1002          */
1003         plugged = mddev_check_plugged(mddev);
1004
1005         raid10_find_phys(conf, r10_bio);
1006 retry_write:
1007         blocked_rdev = NULL;
1008         rcu_read_lock();
1009         max_sectors = r10_bio->sectors;
1010
1011         for (i = 0;  i < conf->copies; i++) {
1012                 int d = r10_bio->devs[i].devnum;
1013                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
1014                 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1015                         atomic_inc(&rdev->nr_pending);
1016                         blocked_rdev = rdev;
1017                         break;
1018                 }
1019                 r10_bio->devs[i].bio = NULL;
1020                 if (!rdev || test_bit(Faulty, &rdev->flags)) {
1021                         set_bit(R10BIO_Degraded, &r10_bio->state);
1022                         continue;
1023                 }
1024                 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1025                         sector_t first_bad;
1026                         sector_t dev_sector = r10_bio->devs[i].addr;
1027                         int bad_sectors;
1028                         int is_bad;
1029
1030                         is_bad = is_badblock(rdev, dev_sector,
1031                                              max_sectors,
1032                                              &first_bad, &bad_sectors);
1033                         if (is_bad < 0) {
1034                                 /* Mustn't write here until the bad block
1035                                  * is acknowledged
1036                                  */
1037                                 atomic_inc(&rdev->nr_pending);
1038                                 set_bit(BlockedBadBlocks, &rdev->flags);
1039                                 blocked_rdev = rdev;
1040                                 break;
1041                         }
1042                         if (is_bad && first_bad <= dev_sector) {
1043                                 /* Cannot write here at all */
1044                                 bad_sectors -= (dev_sector - first_bad);
1045                                 if (bad_sectors < max_sectors)
1046                                         /* Mustn't write more than bad_sectors
1047                                          * to other devices yet
1048                                          */
1049                                         max_sectors = bad_sectors;
1050                                 /* We don't set R10BIO_Degraded as that
1051                                  * only applies if the disk is missing,
1052                                  * so it might be re-added, and we want to
1053                                  * know to recover this chunk.
1054                                  * In this case the device is here, and the
1055                                  * fact that this chunk is not in-sync is
1056                                  * recorded in the bad block log.
1057                                  */
1058                                 continue;
1059                         }
1060                         if (is_bad) {
1061                                 int good_sectors = first_bad - dev_sector;
1062                                 if (good_sectors < max_sectors)
1063                                         max_sectors = good_sectors;
1064                         }
1065                 }
1066                 r10_bio->devs[i].bio = bio;
1067                 atomic_inc(&rdev->nr_pending);
1068         }
1069         rcu_read_unlock();
1070
1071         if (unlikely(blocked_rdev)) {
1072                 /* Have to wait for this device to get unblocked, then retry */
1073                 int j;
1074                 int d;
1075
1076                 for (j = 0; j < i; j++)
1077                         if (r10_bio->devs[j].bio) {
1078                                 d = r10_bio->devs[j].devnum;
1079                                 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1080                         }
1081                 allow_barrier(conf);
1082                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1083                 wait_barrier(conf);
1084                 goto retry_write;
1085         }
1086
1087         if (max_sectors < r10_bio->sectors) {
1088                 /* We are splitting this into multiple parts, so
1089                  * we need to prepare for allocating another r10_bio.
1090                  */
1091                 r10_bio->sectors = max_sectors;
1092                 spin_lock_irq(&conf->device_lock);
1093                 if (bio->bi_phys_segments == 0)
1094                         bio->bi_phys_segments = 2;
1095                 else
1096                         bio->bi_phys_segments++;
1097                 spin_unlock_irq(&conf->device_lock);
1098         }
1099         sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1100
1101         atomic_set(&r10_bio->remaining, 1);
1102         bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1103
1104         for (i = 0; i < conf->copies; i++) {
1105                 struct bio *mbio;
1106                 int d = r10_bio->devs[i].devnum;
1107                 if (!r10_bio->devs[i].bio)
1108                         continue;
1109
1110                 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1111                 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1112                             max_sectors);
1113                 r10_bio->devs[i].bio = mbio;
1114
1115                 mbio->bi_sector = (r10_bio->devs[i].addr+
1116                                    conf->mirrors[d].rdev->data_offset);
1117                 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1118                 mbio->bi_end_io = raid10_end_write_request;
1119                 mbio->bi_rw = WRITE | do_sync | do_fua;
1120                 mbio->bi_private = r10_bio;
1121
1122                 atomic_inc(&r10_bio->remaining);
1123                 spin_lock_irqsave(&conf->device_lock, flags);
1124                 bio_list_add(&conf->pending_bio_list, mbio);
1125                 spin_unlock_irqrestore(&conf->device_lock, flags);
1126         }
1127
1128         if (atomic_dec_and_test(&r10_bio->remaining)) {
1129                 /* This matches the end of raid10_end_write_request() */
1130                 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
1131                                 r10_bio->sectors,
1132                                 !test_bit(R10BIO_Degraded, &r10_bio->state),
1133                                 0);
1134                 md_write_end(mddev);
1135                 raid_end_bio_io(r10_bio);
1136         }
1137
1138         /* In case raid10d snuck in to freeze_array */
1139         wake_up(&conf->wait_barrier);
1140
1141         if (sectors_handled < (bio->bi_size >> 9)) {
1142                 /* We need another r10_bio.  It has already been counted
1143                  * in bio->bi_phys_segments.
1144                  */
1145                 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1146
1147                 r10_bio->master_bio = bio;
1148                 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1149
1150                 r10_bio->mddev = mddev;
1151                 r10_bio->sector = bio->bi_sector + sectors_handled;
1152                 r10_bio->state = 0;
1153                 goto retry_write;
1154         }
1155
1156         if (do_sync || !mddev->bitmap || !plugged)
1157                 md_wakeup_thread(mddev->thread);
1158 }
1159
1160 static void status(struct seq_file *seq, mddev_t *mddev)
1161 {
1162         conf_t *conf = mddev->private;
1163         int i;
1164
1165         if (conf->near_copies < conf->raid_disks)
1166                 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1167         if (conf->near_copies > 1)
1168                 seq_printf(seq, " %d near-copies", conf->near_copies);
1169         if (conf->far_copies > 1) {
1170                 if (conf->far_offset)
1171                         seq_printf(seq, " %d offset-copies", conf->far_copies);
1172                 else
1173                         seq_printf(seq, " %d far-copies", conf->far_copies);
1174         }
1175         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1176                                         conf->raid_disks - mddev->degraded);
1177         for (i = 0; i < conf->raid_disks; i++)
1178                 seq_printf(seq, "%s",
1179                               conf->mirrors[i].rdev &&
1180                               test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1181         seq_printf(seq, "]");
1182 }
1183
1184 /* check if there are enough drives for
1185  * every block to appear on atleast one.
1186  * Don't consider the device numbered 'ignore'
1187  * as we might be about to remove it.
1188  */
1189 static int enough(conf_t *conf, int ignore)
1190 {
1191         int first = 0;
1192
1193         do {
1194                 int n = conf->copies;
1195                 int cnt = 0;
1196                 while (n--) {
1197                         if (conf->mirrors[first].rdev &&
1198                             first != ignore)
1199                                 cnt++;
1200                         first = (first+1) % conf->raid_disks;
1201                 }
1202                 if (cnt == 0)
1203                         return 0;
1204         } while (first != 0);
1205         return 1;
1206 }
1207
1208 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1209 {
1210         char b[BDEVNAME_SIZE];
1211         conf_t *conf = mddev->private;
1212
1213         /*
1214          * If it is not operational, then we have already marked it as dead
1215          * else if it is the last working disks, ignore the error, let the
1216          * next level up know.
1217          * else mark the drive as failed
1218          */
1219         if (test_bit(In_sync, &rdev->flags)
1220             && !enough(conf, rdev->raid_disk))
1221                 /*
1222                  * Don't fail the drive, just return an IO error.
1223                  */
1224                 return;
1225         if (test_and_clear_bit(In_sync, &rdev->flags)) {
1226                 unsigned long flags;
1227                 spin_lock_irqsave(&conf->device_lock, flags);
1228                 mddev->degraded++;
1229                 spin_unlock_irqrestore(&conf->device_lock, flags);
1230                 /*
1231                  * if recovery is running, make sure it aborts.
1232                  */
1233                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1234         }
1235         set_bit(Blocked, &rdev->flags);
1236         set_bit(Faulty, &rdev->flags);
1237         set_bit(MD_CHANGE_DEVS, &mddev->flags);
1238         printk(KERN_ALERT
1239                "md/raid10:%s: Disk failure on %s, disabling device.\n"
1240                "md/raid10:%s: Operation continuing on %d devices.\n",
1241                mdname(mddev), bdevname(rdev->bdev, b),
1242                mdname(mddev), conf->raid_disks - mddev->degraded);
1243 }
1244
1245 static void print_conf(conf_t *conf)
1246 {
1247         int i;
1248         mirror_info_t *tmp;
1249
1250         printk(KERN_DEBUG "RAID10 conf printout:\n");
1251         if (!conf) {
1252                 printk(KERN_DEBUG "(!conf)\n");
1253                 return;
1254         }
1255         printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1256                 conf->raid_disks);
1257
1258         for (i = 0; i < conf->raid_disks; i++) {
1259                 char b[BDEVNAME_SIZE];
1260                 tmp = conf->mirrors + i;
1261                 if (tmp->rdev)
1262                         printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1263                                 i, !test_bit(In_sync, &tmp->rdev->flags),
1264                                 !test_bit(Faulty, &tmp->rdev->flags),
1265                                 bdevname(tmp->rdev->bdev,b));
1266         }
1267 }
1268
/*
 * Tear down the resync buffer pool.
 *
 * The wait_barrier()/allow_barrier() pair first blocks until
 * conf->barrier is zero and then immediately gives back the nr_pending
 * count it took, leaving the counters unchanged.  The pool is then
 * destroyed and its pointer cleared.
 * NOTE(review): presumably called once a resync/recovery pass has
 * finished -- confirm against the caller.
 */
static void close_sync(conf_t *conf)
{
        wait_barrier(conf);
        allow_barrier(conf);

        mempool_destroy(conf->r10buf_pool);
        conf->r10buf_pool = NULL;
}
1277
1278 static int raid10_spare_active(mddev_t *mddev)
1279 {
1280         int i;
1281         conf_t *conf = mddev->private;
1282         mirror_info_t *tmp;
1283         int count = 0;
1284         unsigned long flags;
1285
1286         /*
1287          * Find all non-in_sync disks within the RAID10 configuration
1288          * and mark them in_sync
1289          */
1290         for (i = 0; i < conf->raid_disks; i++) {
1291                 tmp = conf->mirrors + i;
1292                 if (tmp->rdev
1293                     && !test_bit(Faulty, &tmp->rdev->flags)
1294                     && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1295                         count++;
1296                         sysfs_notify_dirent(tmp->rdev->sysfs_state);
1297                 }
1298         }
1299         spin_lock_irqsave(&conf->device_lock, flags);
1300         mddev->degraded -= count;
1301         spin_unlock_irqrestore(&conf->device_lock, flags);
1302
1303         print_conf(conf);
1304         return count;
1305 }
1306
1307
1308 static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1309 {
1310         conf_t *conf = mddev->private;
1311         int err = -EEXIST;
1312         int mirror;
1313         int first = 0;
1314         int last = conf->raid_disks - 1;
1315
1316         if (mddev->recovery_cp < MaxSector)
1317                 /* only hot-add to in-sync arrays, as recovery is
1318                  * very different from resync
1319                  */
1320                 return -EBUSY;
1321         if (!enough(conf, -1))
1322                 return -EINVAL;
1323
1324         if (rdev->raid_disk >= 0)
1325                 first = last = rdev->raid_disk;
1326
1327         if (rdev->saved_raid_disk >= first &&
1328             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1329                 mirror = rdev->saved_raid_disk;
1330         else
1331                 mirror = first;
1332         for ( ; mirror <= last ; mirror++) {
1333                 mirror_info_t *p = &conf->mirrors[mirror];
1334                 if (p->recovery_disabled == mddev->recovery_disabled)
1335                         continue;
1336                 if (!p->rdev)
1337                         continue;
1338
1339                 disk_stack_limits(mddev->gendisk, rdev->bdev,
1340                                   rdev->data_offset << 9);
1341                 /* as we don't honour merge_bvec_fn, we must
1342                  * never risk violating it, so limit
1343                  * ->max_segments to one lying with a single
1344                  * page, as a one page request is never in
1345                  * violation.
1346                  */
1347                 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1348                         blk_queue_max_segments(mddev->queue, 1);
1349                         blk_queue_segment_boundary(mddev->queue,
1350                                                    PAGE_CACHE_SIZE - 1);
1351                 }
1352
1353                 p->head_position = 0;
1354                 rdev->raid_disk = mirror;
1355                 err = 0;
1356                 if (rdev->saved_raid_disk != mirror)
1357                         conf->fullsync = 1;
1358                 rcu_assign_pointer(p->rdev, rdev);
1359                 break;
1360         }
1361
1362         md_integrity_add_rdev(rdev, mddev);
1363         print_conf(conf);
1364         return err;
1365 }
1366
1367 static int raid10_remove_disk(mddev_t *mddev, int number)
1368 {
1369         conf_t *conf = mddev->private;
1370         int err = 0;
1371         mdk_rdev_t *rdev;
1372         mirror_info_t *p = conf->mirrors+ number;
1373
1374         print_conf(conf);
1375         rdev = p->rdev;
1376         if (rdev) {
1377                 if (test_bit(In_sync, &rdev->flags) ||
1378                     atomic_read(&rdev->nr_pending)) {
1379                         err = -EBUSY;
1380                         goto abort;
1381                 }
1382                 /* Only remove faulty devices in recovery
1383                  * is not possible.
1384                  */
1385                 if (!test_bit(Faulty, &rdev->flags) &&
1386                     mddev->recovery_disabled != p->recovery_disabled &&
1387                     enough(conf, -1)) {
1388                         err = -EBUSY;
1389                         goto abort;
1390                 }
1391                 p->rdev = NULL;
1392                 synchronize_rcu();
1393                 if (atomic_read(&rdev->nr_pending)) {
1394                         /* lost the race, try later */
1395                         err = -EBUSY;
1396                         p->rdev = rdev;
1397                         goto abort;
1398                 }
1399                 err = md_integrity_register(mddev);
1400         }
1401 abort:
1402
1403         print_conf(conf);
1404         return err;
1405 }
1406
1407
1408 static void end_sync_read(struct bio *bio, int error)
1409 {
1410         r10bio_t *r10_bio = bio->bi_private;
1411         conf_t *conf = r10_bio->mddev->private;
1412         int d;
1413
1414         d = find_bio_disk(conf, r10_bio, bio, NULL);
1415
1416         if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1417                 set_bit(R10BIO_Uptodate, &r10_bio->state);
1418         else
1419                 /* The write handler will notice the lack of
1420                  * R10BIO_Uptodate and record any errors etc
1421                  */
1422                 atomic_add(r10_bio->sectors,
1423                            &conf->mirrors[d].rdev->corrected_errors);
1424
1425         /* for reconstruct, we always reschedule after a read.
1426          * for resync, only after all reads
1427          */
1428         rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1429         if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1430             atomic_dec_and_test(&r10_bio->remaining)) {
1431                 /* we have read all the blocks,
1432                  * do the comparison in process context in raid10d
1433                  */
1434                 reschedule_retry(r10_bio);
1435         }
1436 }
1437
1438 static void end_sync_request(r10bio_t *r10_bio)
1439 {
1440         mddev_t *mddev = r10_bio->mddev;
1441
1442         while (atomic_dec_and_test(&r10_bio->remaining)) {
1443                 if (r10_bio->master_bio == NULL) {
1444                         /* the primary of several recovery bios */
1445                         sector_t s = r10_bio->sectors;
1446                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1447                             test_bit(R10BIO_WriteError, &r10_bio->state))
1448                                 reschedule_retry(r10_bio);
1449                         else
1450                                 put_buf(r10_bio);
1451                         md_done_sync(mddev, s, 1);
1452                         break;
1453                 } else {
1454                         r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1455                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1456                             test_bit(R10BIO_WriteError, &r10_bio->state))
1457                                 reschedule_retry(r10_bio);
1458                         else
1459                                 put_buf(r10_bio);
1460                         r10_bio = r10_bio2;
1461                 }
1462         }
1463 }
1464
1465 static void end_sync_write(struct bio *bio, int error)
1466 {
1467         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1468         r10bio_t *r10_bio = bio->bi_private;
1469         mddev_t *mddev = r10_bio->mddev;
1470         conf_t *conf = mddev->private;
1471         int d;
1472         sector_t first_bad;
1473         int bad_sectors;
1474         int slot;
1475
1476         d = find_bio_disk(conf, r10_bio, bio, &slot);
1477
1478         if (!uptodate) {
1479                 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1480                 set_bit(R10BIO_WriteError, &r10_bio->state);
1481         } else if (is_badblock(conf->mirrors[d].rdev,
1482                              r10_bio->devs[slot].addr,
1483                              r10_bio->sectors,
1484                              &first_bad, &bad_sectors))
1485                 set_bit(R10BIO_MadeGood, &r10_bio->state);
1486
1487         rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1488
1489         end_sync_request(r10_bio);
1490 }
1491
/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
1504 /*
1505  * We check if all blocks are in-sync and only write to blocks that
1506  * aren't in sync
1507  */
1508 static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1509 {
1510         conf_t *conf = mddev->private;
1511         int i, first;
1512         struct bio *tbio, *fbio;
1513
1514         atomic_set(&r10_bio->remaining, 1);
1515
1516         /* find the first device with a block */
1517         for (i=0; i<conf->copies; i++)
1518                 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1519                         break;
1520
1521         if (i == conf->copies)
1522                 goto done;
1523
1524         first = i;
1525         fbio = r10_bio->devs[i].bio;
1526
1527         /* now find blocks with errors */
1528         for (i=0 ; i < conf->copies ; i++) {
1529                 int  j, d;
1530                 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1531
1532                 tbio = r10_bio->devs[i].bio;
1533
1534                 if (tbio->bi_end_io != end_sync_read)
1535                         continue;
1536                 if (i == first)
1537                         continue;
1538                 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1539                         /* We know that the bi_io_vec layout is the same for
1540                          * both 'first' and 'i', so we just compare them.
1541                          * All vec entries are PAGE_SIZE;
1542                          */
1543                         for (j = 0; j < vcnt; j++)
1544                                 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1545                                            page_address(tbio->bi_io_vec[j].bv_page),
1546                                            PAGE_SIZE))
1547                                         break;
1548                         if (j == vcnt)
1549                                 continue;
1550                         mddev->resync_mismatches += r10_bio->sectors;
1551                         if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1552                                 /* Don't fix anything. */
1553                                 continue;
1554                 }
1555                 /* Ok, we need to write this bio, either to correct an
1556                  * inconsistency or to correct an unreadable block.
1557                  * First we need to fixup bv_offset, bv_len and
1558                  * bi_vecs, as the read request might have corrupted these
1559                  */
1560                 tbio->bi_vcnt = vcnt;
1561                 tbio->bi_size = r10_bio->sectors << 9;
1562                 tbio->bi_idx = 0;
1563                 tbio->bi_phys_segments = 0;
1564                 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1565                 tbio->bi_flags |= 1 << BIO_UPTODATE;
1566                 tbio->bi_next = NULL;
1567                 tbio->bi_rw = WRITE;
1568                 tbio->bi_private = r10_bio;
1569                 tbio->bi_sector = r10_bio->devs[i].addr;
1570
1571                 for (j=0; j < vcnt ; j++) {
1572                         tbio->bi_io_vec[j].bv_offset = 0;
1573                         tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1574
1575                         memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1576                                page_address(fbio->bi_io_vec[j].bv_page),
1577                                PAGE_SIZE);
1578                 }
1579                 tbio->bi_end_io = end_sync_write;
1580
1581                 d = r10_bio->devs[i].devnum;
1582                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1583                 atomic_inc(&r10_bio->remaining);
1584                 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1585
1586                 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1587                 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1588                 generic_make_request(tbio);
1589         }
1590
1591 done:
1592         if (atomic_dec_and_test(&r10_bio->remaining)) {
1593                 md_done_sync(mddev, r10_bio->sectors, 1);
1594                 put_buf(r10_bio);
1595         }
1596 }
1597
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choose a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use: the first for reading,
 * the second for writing.
 *
 */
1608 static void fix_recovery_read_error(r10bio_t *r10_bio)
1609 {
1610         /* We got a read error during recovery.
1611          * We repeat the read in smaller page-sized sections.
1612          * If a read succeeds, write it to the new device or record
1613          * a bad block if we cannot.
1614          * If a read fails, record a bad block on both old and
1615          * new devices.
1616          */
1617         mddev_t *mddev = r10_bio->mddev;
1618         conf_t *conf = mddev->private;
1619         struct bio *bio = r10_bio->devs[0].bio;
1620         sector_t sect = 0;
1621         int sectors = r10_bio->sectors;
1622         int idx = 0;
1623         int dr = r10_bio->devs[0].devnum;
1624         int dw = r10_bio->devs[1].devnum;
1625
1626         while (sectors) {
1627                 int s = sectors;
1628                 mdk_rdev_t *rdev;
1629                 sector_t addr;
1630                 int ok;
1631
1632                 if (s > (PAGE_SIZE>>9))
1633                         s = PAGE_SIZE >> 9;
1634
1635                 rdev = conf->mirrors[dr].rdev;
1636                 addr = r10_bio->devs[0].addr + sect,
1637                 ok = sync_page_io(rdev,
1638                                   addr,
1639                                   s << 9,
1640                                   bio->bi_io_vec[idx].bv_page,
1641                                   READ, false);
1642                 if (ok) {
1643                         rdev = conf->mirrors[dw].rdev;
1644                         addr = r10_bio->devs[1].addr + sect;
1645                         ok = sync_page_io(rdev,
1646                                           addr,
1647                                           s << 9,
1648                                           bio->bi_io_vec[idx].bv_page,
1649                                           WRITE, false);
1650                         if (!ok)
1651                                 set_bit(WriteErrorSeen, &rdev->flags);
1652                 }
1653                 if (!ok) {
1654                         /* We don't worry if we cannot set a bad block -
1655                          * it really is bad so there is no loss in not
1656                          * recording it yet
1657                          */
1658                         rdev_set_badblocks(rdev, addr, s, 0);
1659
1660                         if (rdev != conf->mirrors[dw].rdev) {
1661                                 /* need bad block on destination too */
1662                                 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
1663                                 addr = r10_bio->devs[1].addr + sect;
1664                                 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1665                                 if (!ok) {
1666                                         /* just abort the recovery */
1667                                         printk(KERN_NOTICE
1668                                                "md/raid10:%s: recovery aborted"
1669                                                " due to read error\n",
1670                                                mdname(mddev));
1671
1672                                         conf->mirrors[dw].recovery_disabled
1673                                                 = mddev->recovery_disabled;
1674                                         set_bit(MD_RECOVERY_INTR,
1675                                                 &mddev->recovery);
1676                                         break;
1677                                 }
1678                         }
1679                 }
1680
1681                 sectors -= s;
1682                 sect += s;
1683                 idx++;
1684         }
1685 }
1686
1687 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1688 {
1689         conf_t *conf = mddev->private;
1690         int d;
1691         struct bio *wbio;
1692
1693         if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1694                 fix_recovery_read_error(r10_bio);
1695                 end_sync_request(r10_bio);
1696                 return;
1697         }
1698
1699         /*
1700          * share the pages with the first bio
1701          * and submit the write request
1702          */
1703         wbio = r10_bio->devs[1].bio;
1704         d = r10_bio->devs[1].devnum;
1705
1706         atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1707         md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1708         generic_make_request(wbio);
1709 }
1710
1711
1712 /*
1713  * Used by fix_read_error() to decay the per rdev read_errors.
1714  * We halve the read error count for every hour that has elapsed
1715  * since the last recorded read error.
1716  *
1717  */
1718 static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1719 {
1720         struct timespec cur_time_mon;
1721         unsigned long hours_since_last;
1722         unsigned int read_errors = atomic_read(&rdev->read_errors);
1723
1724         ktime_get_ts(&cur_time_mon);
1725
1726         if (rdev->last_read_error.tv_sec == 0 &&
1727             rdev->last_read_error.tv_nsec == 0) {
1728                 /* first time we've seen a read error */
1729                 rdev->last_read_error = cur_time_mon;
1730                 return;
1731         }
1732
1733         hours_since_last = (cur_time_mon.tv_sec -
1734                             rdev->last_read_error.tv_sec) / 3600;
1735
1736         rdev->last_read_error = cur_time_mon;
1737
1738         /*
1739          * if hours_since_last is > the number of bits in read_errors
1740          * just set read errors to 0. We do this to avoid
1741          * overflowing the shift of read_errors by hours_since_last.
1742          */
1743         if (hours_since_last >= 8 * sizeof(read_errors))
1744                 atomic_set(&rdev->read_errors, 0);
1745         else
1746                 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1747 }
1748
1749 static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1750                             int sectors, struct page *page, int rw)
1751 {
1752         sector_t first_bad;
1753         int bad_sectors;
1754
1755         if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
1756             && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
1757                 return -1;
1758         if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1759                 /* success */
1760                 return 1;
1761         if (rw == WRITE)
1762                 set_bit(WriteErrorSeen, &rdev->flags);
1763         /* need to record an error - either for the block or the device */
1764         if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1765                 md_error(rdev->mddev, rdev);
1766         return 0;
1767 }
1768
/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working mirrors.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */
1776
1777 static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1778 {
1779         int sect = 0; /* Offset from r10_bio->sector */
1780         int sectors = r10_bio->sectors;
1781         mdk_rdev_t*rdev;
1782         int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1783         int d = r10_bio->devs[r10_bio->read_slot].devnum;
1784
1785         /* still own a reference to this rdev, so it cannot
1786          * have been cleared recently.
1787          */
1788         rdev = conf->mirrors[d].rdev;
1789
1790         if (test_bit(Faulty, &rdev->flags))
1791                 /* drive has already been failed, just ignore any
1792                    more fix_read_error() attempts */
1793                 return;
1794
1795         check_decay_read_errors(mddev, rdev);
1796         atomic_inc(&rdev->read_errors);
1797         if (atomic_read(&rdev->read_errors) > max_read_errors) {
1798                 char b[BDEVNAME_SIZE];
1799                 bdevname(rdev->bdev, b);
1800
1801                 printk(KERN_NOTICE
1802                        "md/raid10:%s: %s: Raid device exceeded "
1803                        "read_error threshold [cur %d:max %d]\n",
1804                        mdname(mddev), b,
1805                        atomic_read(&rdev->read_errors), max_read_errors);
1806                 printk(KERN_NOTICE
1807                        "md/raid10:%s: %s: Failing raid device\n",
1808                        mdname(mddev), b);
1809                 md_error(mddev, conf->mirrors[d].rdev);
1810                 return;
1811         }
1812
1813         while(sectors) {
1814                 int s = sectors;
1815                 int sl = r10_bio->read_slot;
1816                 int success = 0;
1817                 int start;
1818
1819                 if (s > (PAGE_SIZE>>9))
1820                         s = PAGE_SIZE >> 9;
1821
1822                 rcu_read_lock();
1823                 do {
1824                         sector_t first_bad;
1825                         int bad_sectors;
1826
1827                         d = r10_bio->devs[sl].devnum;
1828                         rdev = rcu_dereference(conf->mirrors[d].rdev);
1829                         if (rdev &&
1830                             test_bit(In_sync, &rdev->flags) &&
1831                             is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
1832                                         &first_bad, &bad_sectors) == 0) {
1833                                 atomic_inc(&rdev->nr_pending);
1834                                 rcu_read_unlock();
1835                                 success = sync_page_io(rdev,
1836                                                        r10_bio->devs[sl].addr +
1837                                                        sect,
1838                                                        s<<9,
1839                                                        conf->tmppage, READ, false);
1840                                 rdev_dec_pending(rdev, mddev);
1841                                 rcu_read_lock();
1842                                 if (success)
1843                                         break;
1844                         }
1845                         sl++;
1846                         if (sl == conf->copies)
1847                                 sl = 0;
1848                 } while (!success && sl != r10_bio->read_slot);
1849                 rcu_read_unlock();
1850
1851                 if (!success) {
1852                         /* Cannot read from anywhere, just mark the block
1853                          * as bad on the first device to discourage future
1854                          * reads.
1855                          */
1856                         int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1857                         rdev = conf->mirrors[dn].rdev;
1858
1859                         if (!rdev_set_badblocks(
1860                                     rdev,
1861                                     r10_bio->devs[r10_bio->read_slot].addr
1862                                     + sect,
1863                                     s, 0))
1864                                 md_error(mddev, rdev);
1865                         break;
1866                 }
1867
1868                 start = sl;
1869                 /* write it back and re-read */
1870                 rcu_read_lock();
1871                 while (sl != r10_bio->read_slot) {
1872                         char b[BDEVNAME_SIZE];
1873
1874                         if (sl==0)
1875                                 sl = conf->copies;
1876                         sl--;
1877                         d = r10_bio->devs[sl].devnum;
1878                         rdev = rcu_dereference(conf->mirrors[d].rdev);
1879                         if (!rdev ||
1880                             !test_bit(In_sync, &rdev->flags))
1881                                 continue;
1882
1883                         atomic_inc(&rdev->nr_pending);
1884                         rcu_read_unlock();
1885                         if (r10_sync_page_io(rdev,
1886                                              r10_bio->devs[sl].addr +
1887                                              sect,
1888                                              s<<9, conf->tmppage, WRITE)
1889                             == 0) {
1890                                 /* Well, this device is dead */
1891                                 printk(KERN_NOTICE
1892                                        "md/raid10:%s: read correction "
1893                                        "write failed"
1894                                        " (%d sectors at %llu on %s)\n",
1895                                        mdname(mddev), s,
1896                                        (unsigned long long)(
1897                                                sect + rdev->data_offset),
1898                                        bdevname(rdev->bdev, b));
1899                                 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1900                                        "drive\n",
1901                                        mdname(mddev),
1902                                        bdevname(rdev->bdev, b));
1903                         }
1904                         rdev_dec_pending(rdev, mddev);
1905                         rcu_read_lock();
1906                 }
1907                 sl = start;
1908                 while (sl != r10_bio->read_slot) {
1909                         char b[BDEVNAME_SIZE];
1910
1911                         if (sl==0)
1912                                 sl = conf->copies;
1913                         sl--;
1914                         d = r10_bio->devs[sl].devnum;
1915                         rdev = rcu_dereference(conf->mirrors[d].rdev);
1916                         if (!rdev ||
1917                             !test_bit(In_sync, &rdev->flags))
1918                                 continue;
1919
1920                         atomic_inc(&rdev->nr_pending);
1921                         rcu_read_unlock();
1922                         switch (r10_sync_page_io(rdev,
1923                                              r10_bio->devs[sl].addr +
1924                                              sect,
1925                                              s<<9, conf->tmppage,
1926                                                  READ)) {
1927                         case 0:
1928                                 /* Well, this device is dead */
1929                                 printk(KERN_NOTICE
1930                                        "md/raid10:%s: unable to read back "
1931                                        "corrected sectors"
1932                                        " (%d sectors at %llu on %s)\n",
1933                                        mdname(mddev), s,
1934                                        (unsigned long long)(
1935                                                sect + rdev->data_offset),
1936                                        bdevname(rdev->bdev, b));
1937                                 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1938                                        "drive\n",
1939                                        mdname(mddev),
1940                                        bdevname(rdev->bdev, b));
1941                                 break;
1942                         case 1:
1943                                 printk(KERN_INFO
1944                                        "md/raid10:%s: read error corrected"
1945                                        " (%d sectors at %llu on %s)\n",
1946                                        mdname(mddev), s,
1947                                        (unsigned long long)(
1948                                                sect + rdev->data_offset),
1949                                        bdevname(rdev->bdev, b));
1950                                 atomic_add(s, &rdev->corrected_errors);
1951                         }
1952
1953                         rdev_dec_pending(rdev, mddev);
1954                         rcu_read_lock();
1955                 }
1956                 rcu_read_unlock();
1957
1958                 sectors -= s;
1959                 sect += s;
1960         }
1961 }
1962
1963 static void bi_complete(struct bio *bio, int error)
1964 {
1965         complete((struct completion *)bio->bi_private);
1966 }
1967
1968 static int submit_bio_wait(int rw, struct bio *bio)
1969 {
1970         struct completion event;
1971         rw |= REQ_SYNC;
1972
1973         init_completion(&event);
1974         bio->bi_private = &event;
1975         bio->bi_end_io = bi_complete;
1976         submit_bio(rw, bio);
1977         wait_for_completion(&event);
1978
1979         return test_bit(BIO_UPTODATE, &bio->bi_flags);
1980 }
1981
1982 static int narrow_write_error(r10bio_t *r10_bio, int i)
1983 {
1984         struct bio *bio = r10_bio->master_bio;
1985         mddev_t *mddev = r10_bio->mddev;
1986         conf_t *conf = mddev->private;
1987         mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
1988         /* bio has the data to be written to slot 'i' where
1989          * we just recently had a write error.
1990          * We repeatedly clone the bio and trim down to one block,
1991          * then try the write.  Where the write fails we record
1992          * a bad block.
1993          * It is conceivable that the bio doesn't exactly align with
1994          * blocks.  We must handle this.
1995          *
1996          * We currently own a reference to the rdev.
1997          */
1998
1999         int block_sectors;
2000         sector_t sector;
2001         int sectors;
2002         int sect_to_write = r10_bio->sectors;
2003         int ok = 1;
2004
2005         if (rdev->badblocks.shift < 0)
2006                 return 0;
2007
2008         block_sectors = 1 << rdev->badblocks.shift;
2009         sector = r10_bio->sector;
2010         sectors = ((r10_bio->sector + block_sectors)
2011                    & ~(sector_t)(block_sectors - 1))
2012                 - sector;
2013
2014         while (sect_to_write) {
2015                 struct bio *wbio;
2016                 if (sectors > sect_to_write)
2017                         sectors = sect_to_write;
2018                 /* Write at 'sector' for 'sectors' */
2019                 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2020                 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2021                 wbio->bi_sector = (r10_bio->devs[i].addr+
2022                                    rdev->data_offset+
2023                                    (sector - r10_bio->sector));
2024                 wbio->bi_bdev = rdev->bdev;
2025                 if (submit_bio_wait(WRITE, wbio) == 0)
2026                         /* Failure! */
2027                         ok = rdev_set_badblocks(rdev, sector,
2028                                                 sectors, 0)
2029                                 && ok;
2030
2031                 bio_put(wbio);
2032                 sect_to_write -= sectors;
2033                 sector += sectors;
2034                 sectors = block_sectors;
2035         }
2036         return ok;
2037 }
2038
2039 static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2040 {
2041         int slot = r10_bio->read_slot;
2042         int mirror = r10_bio->devs[slot].devnum;
2043         struct bio *bio;
2044         conf_t *conf = mddev->private;
2045         mdk_rdev_t *rdev;
2046         char b[BDEVNAME_SIZE];
2047         unsigned long do_sync;
2048         int max_sectors;
2049
2050         /* we got a read error. Maybe the drive is bad.  Maybe just
2051          * the block and we can fix it.
2052          * We freeze all other IO, and try reading the block from
2053          * other devices.  When we find one, we re-write
2054          * and check it that fixes the read error.
2055          * This is all done synchronously while the array is
2056          * frozen.
2057          */
2058         if (mddev->ro == 0) {
2059                 freeze_array(conf);
2060                 fix_read_error(conf, mddev, r10_bio);
2061                 unfreeze_array(conf);
2062         }
2063         rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2064
2065         bio = r10_bio->devs[slot].bio;
2066         bdevname(bio->bi_bdev, b);
2067         r10_bio->devs[slot].bio =
2068                 mddev->ro ? IO_BLOCKED : NULL;
2069 read_more:
2070         mirror = read_balance(conf, r10_bio, &max_sectors);
2071         if (mirror == -1) {
2072                 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2073                        " read error for block %llu\n",
2074                        mdname(mddev), b,
2075                        (unsigned long long)r10_bio->sector);
2076                 raid_end_bio_io(r10_bio);
2077                 bio_put(bio);
2078                 return;
2079         }
2080
2081         do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2082         if (bio)
2083                 bio_put(bio);
2084         slot = r10_bio->read_slot;
2085         rdev = conf->mirrors[mirror].rdev;
2086         printk_ratelimited(
2087                 KERN_ERR
2088                 "md/raid10:%s: %s: redirecting"
2089                 "sector %llu to another mirror\n",
2090                 mdname(mddev),
2091                 bdevname(rdev->bdev, b),
2092                 (unsigned long long)r10_bio->sector);
2093         bio = bio_clone_mddev(r10_bio->master_bio,
2094                               GFP_NOIO, mddev);
2095         md_trim_bio(bio,
2096                     r10_bio->sector - bio->bi_sector,
2097                     max_sectors);
2098         r10_bio->devs[slot].bio = bio;
2099         bio->bi_sector = r10_bio->devs[slot].addr
2100                 + rdev->data_offset;
2101         bio->bi_bdev = rdev->bdev;
2102         bio->bi_rw = READ | do_sync;
2103         bio->bi_private = r10_bio;
2104         bio->bi_end_io = raid10_end_read_request;
2105         if (max_sectors < r10_bio->sectors) {
2106                 /* Drat - have to split this up more */
2107                 struct bio *mbio = r10_bio->master_bio;
2108                 int sectors_handled =
2109                         r10_bio->sector + max_sectors
2110                         - mbio->bi_sector;
2111                 r10_bio->sectors = max_sectors;
2112                 spin_lock_irq(&conf->device_lock);
2113                 if (mbio->bi_phys_segments == 0)
2114                         mbio->bi_phys_segments = 2;
2115                 else
2116                         mbio->bi_phys_segments++;
2117                 spin_unlock_irq(&conf->device_lock);
2118                 generic_make_request(bio);
2119                 bio = NULL;
2120
2121                 r10_bio = mempool_alloc(conf->r10bio_pool,
2122                                         GFP_NOIO);
2123                 r10_bio->master_bio = mbio;
2124                 r10_bio->sectors = (mbio->bi_size >> 9)
2125                         - sectors_handled;
2126                 r10_bio->state = 0;
2127                 set_bit(R10BIO_ReadError,
2128                         &r10_bio->state);
2129                 r10_bio->mddev = mddev;
2130                 r10_bio->sector = mbio->bi_sector
2131                         + sectors_handled;
2132
2133                 goto read_more;
2134         } else
2135                 generic_make_request(bio);
2136 }
2137
2138 static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2139 {
2140         /* Some sort of write request has finished and it
2141          * succeeded in writing where we thought there was a
2142          * bad block.  So forget the bad block.
2143          * Or possibly if failed and we need to record
2144          * a bad block.
2145          */
2146         int m;
2147         mdk_rdev_t *rdev;
2148
2149         if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2150             test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2151                 for (m = 0; m < conf->copies; m++) {
2152                         int dev = r10_bio->devs[m].devnum;
2153                         rdev = conf->mirrors[dev].rdev;
2154                         if (r10_bio->devs[m].bio == NULL)
2155                                 continue;
2156                         if (test_bit(BIO_UPTODATE,
2157                                      &r10_bio->devs[m].bio->bi_flags)) {
2158                                 rdev_clear_badblocks(
2159                                         rdev,
2160                                         r10_bio->devs[m].addr,
2161                                         r10_bio->sectors);
2162                         } else {
2163                                 if (!rdev_set_badblocks(
2164                                             rdev,
2165                                             r10_bio->devs[m].addr,
2166                                             r10_bio->sectors, 0))
2167                                         md_error(conf->mddev, rdev);
2168                         }
2169                 }
2170                 put_buf(r10_bio);
2171         } else {
2172                 for (m = 0; m < conf->copies; m++) {
2173                         int dev = r10_bio->devs[m].devnum;
2174                         struct bio *bio = r10_bio->devs[m].bio;
2175                         rdev = conf->mirrors[dev].rdev;
2176                         if (bio == IO_MADE_GOOD) {
2177                                 rdev_clear_badblocks(
2178                                         rdev,
2179                                         r10_bio->devs[m].addr,
2180                                         r10_bio->sectors);
2181                                 rdev_dec_pending(rdev, conf->mddev);
2182                         } else if (bio != NULL &&
2183                                    !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2184                                 if (!narrow_write_error(r10_bio, m)) {
2185                                         md_error(conf->mddev, rdev);
2186                                         set_bit(R10BIO_Degraded,
2187                                                 &r10_bio->state);
2188                                 }
2189                                 rdev_dec_pending(rdev, conf->mddev);
2190                         }
2191                 }
2192                 if (test_bit(R10BIO_WriteError,
2193                              &r10_bio->state))
2194                         close_write(r10_bio);
2195                 raid_end_bio_io(r10_bio);
2196         }
2197 }
2198
2199 static void raid10d(mddev_t *mddev)
2200 {
2201         r10bio_t *r10_bio;
2202         unsigned long flags;
2203         conf_t *conf = mddev->private;
2204         struct list_head *head = &conf->retry_list;
2205         struct blk_plug plug;
2206
2207         md_check_recovery(mddev);
2208
2209         blk_start_plug(&plug);
2210         for (;;) {
2211
2212                 flush_pending_writes(conf);
2213
2214                 spin_lock_irqsave(&conf->device_lock, flags);
2215                 if (list_empty(head)) {
2216                         spin_unlock_irqrestore(&conf->device_lock, flags);
2217                         break;
2218                 }
2219                 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
2220                 list_del(head->prev);
2221                 conf->nr_queued--;
2222                 spin_unlock_irqrestore(&conf->device_lock, flags);
2223
2224                 mddev = r10_bio->mddev;
2225                 conf = mddev->private;
2226                 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2227                     test_bit(R10BIO_WriteError, &r10_bio->state))
2228                         handle_write_completed(conf, r10_bio);
2229                 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2230                         sync_request_write(mddev, r10_bio);
2231                 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2232                         recovery_request_write(mddev, r10_bio);
2233                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2234                         handle_read_error(mddev, r10_bio);
2235                 else {
2236                         /* just a partial read to be scheduled from a
2237                          * separate context
2238                          */
2239                         int slot = r10_bio->read_slot;
2240                         generic_make_request(r10_bio->devs[slot].bio);
2241                 }
2242
2243                 cond_resched();
2244                 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2245                         md_check_recovery(mddev);
2246         }
2247         blk_finish_plug(&plug);
2248 }
2249
2250
2251 static int init_resync(conf_t *conf)
2252 {
2253         int buffs;
2254
2255         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2256         BUG_ON(conf->r10buf_pool);
2257         conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2258         if (!conf->r10buf_pool)
2259                 return -ENOMEM;
2260         conf->next_resync = 0;
2261         return 0;
2262 }
2263
2264 /*
2265  * perform a "sync" on one "block"
2266  *
2267  * We need to make sure that no normal I/O request - particularly write
2268  * requests - conflict with active sync requests.
2269  *
2270  * This is achieved by tracking pending requests and a 'barrier' concept
2271  * that can be installed to exclude normal IO requests.
2272  *
2273  * Resync and recovery are handled very differently.
2274  * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2275  *
2276  * For resync, we iterate over virtual addresses, read all copies,
2277  * and update if there are differences.  If only one copy is live,
2278  * skip it.
2279  * For recovery, we iterate over physical addresses, read a good
2280  * value for each non-in_sync drive, and over-write.
2281  *
2282  * So, for recovery we may have several outstanding complex requests for a
2283  * given address, one for each out-of-sync device.  We model this by allocating
2284  * a number of r10_bio structures, one for each out-of-sync device.
2285  * As we setup these structures, we collect all bio's together into a list
2286  * which we then process collectively to add pages, and then process again
2287  * to pass to generic_make_request.
2288  *
2289  * The r10_bio structures are linked using a borrowed master_bio pointer.
2290  * This link is counted in ->remaining.  When the r10_bio that points to NULL
2291  * has its remaining count decremented to 0, the whole complex operation
2292  * is complete.
2293  *
2294  */
2295
2296 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2297                              int *skipped, int go_faster)
2298 {
2299         conf_t *conf = mddev->private;
2300         r10bio_t *r10_bio;
2301         struct bio *biolist = NULL, *bio;
2302         sector_t max_sector, nr_sectors;
2303         int i;
2304         int max_sync;
2305         sector_t sync_blocks;
2306         sector_t sectors_skipped = 0;
2307         int chunks_skipped = 0;
2308
2309         if (!conf->r10buf_pool)
2310                 if (init_resync(conf))
2311                         return 0;
2312
2313  skipped:
2314         max_sector = mddev->dev_sectors;
2315         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2316                 max_sector = mddev->resync_max_sectors;
2317         if (sector_nr >= max_sector) {
2318                 /* If we aborted, we need to abort the
2319                  * sync on the 'current' bitmap chucks (there can
2320                  * be several when recovering multiple devices).
2321                  * as we may have started syncing it but not finished.
2322                  * We can find the current address in
2323                  * mddev->curr_resync, but for recovery,
2324                  * we need to convert that to several
2325                  * virtual addresses.
2326                  */
2327                 if (mddev->curr_resync < max_sector) { /* aborted */
2328                         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2329                                 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2330                                                 &sync_blocks, 1);
2331                         else for (i=0; i<conf->raid_disks; i++) {
2332                                 sector_t sect =
2333                                         raid10_find_virt(conf, mddev->curr_resync, i);
2334                                 bitmap_end_sync(mddev->bitmap, sect,
2335                                                 &sync_blocks, 1);
2336                         }
2337                 } else /* completed sync */
2338                         conf->fullsync = 0;
2339
2340                 bitmap_close_sync(mddev->bitmap);
2341                 close_sync(conf);
2342                 *skipped = 1;
2343                 return sectors_skipped;
2344         }
2345         if (chunks_skipped >= conf->raid_disks) {
2346                 /* if there has been nothing to do on any drive,
2347                  * then there is nothing to do at all..
2348                  */
2349                 *skipped = 1;
2350                 return (max_sector - sector_nr) + sectors_skipped;
2351         }
2352
2353         if (max_sector > mddev->resync_max)
2354                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2355
2356         /* make sure whole request will fit in a chunk - if chunks
2357          * are meaningful
2358          */
2359         if (conf->near_copies < conf->raid_disks &&
2360             max_sector > (sector_nr | conf->chunk_mask))
2361                 max_sector = (sector_nr | conf->chunk_mask) + 1;
2362         /*
2363          * If there is non-resync activity waiting for us then
2364          * put in a delay to throttle resync.
2365          */
2366         if (!go_faster && conf->nr_waiting)
2367                 msleep_interruptible(1000);
2368
2369         /* Again, very different code for resync and recovery.
2370          * Both must result in an r10bio with a list of bios that
2371          * have bi_end_io, bi_sector, bi_bdev set,
2372          * and bi_private set to the r10bio.
2373          * For recovery, we may actually create several r10bios
2374          * with 2 bios in each, that correspond to the bios in the main one.
2375          * In this case, the subordinate r10bios link back through a
2376          * borrowed master_bio pointer, and the counter in the master
2377          * includes a ref from each subordinate.
2378          */
2379         /* First, we decide what to do and set ->bi_end_io
2380          * To end_sync_read if we want to read, and
2381          * end_sync_write if we will want to write.
2382          */
2383
2384         max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2385         if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2386                 /* recovery... the complicated one */
2387                 int j;
2388                 r10_bio = NULL;
2389
2390                 for (i=0 ; i<conf->raid_disks; i++) {
2391                         int still_degraded;
2392                         r10bio_t *rb2;
2393                         sector_t sect;
2394                         int must_sync;
2395                         int any_working;
2396
2397                         if (conf->mirrors[i].rdev == NULL ||
2398                             test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 
2399                                 continue;
2400
2401                         still_degraded = 0;
2402                         /* want to reconstruct this device */
2403                         rb2 = r10_bio;
2404                         sect = raid10_find_virt(conf, sector_nr, i);
2405                         /* Unless we are doing a full sync, we only need
2406                          * to recover the block if it is set in the bitmap
2407                          */
2408                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
2409                                                       &sync_blocks, 1);
2410                         if (sync_blocks < max_sync)
2411                                 max_sync = sync_blocks;
2412                         if (!must_sync &&
2413                             !conf->fullsync) {
2414                                 /* yep, skip the sync_blocks here, but don't assume
2415                                  * that there will never be anything to do here
2416                                  */
2417                                 chunks_skipped = -1;
2418                                 continue;
2419                         }
2420
2421                         r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2422                         raise_barrier(conf, rb2 != NULL);
2423                         atomic_set(&r10_bio->remaining, 0);
2424
2425                         r10_bio->master_bio = (struct bio*)rb2;
2426                         if (rb2)
2427                                 atomic_inc(&rb2->remaining);
2428                         r10_bio->mddev = mddev;
2429                         set_bit(R10BIO_IsRecover, &r10_bio->state);
2430                         r10_bio->sector = sect;
2431
2432                         raid10_find_phys(conf, r10_bio);
2433
2434                         /* Need to check if the array will still be
2435                          * degraded
2436                          */
2437                         for (j=0; j<conf->raid_disks; j++)
2438                                 if (conf->mirrors[j].rdev == NULL ||
2439                                     test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2440                                         still_degraded = 1;
2441                                         break;
2442                                 }
2443
2444                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
2445                                                       &sync_blocks, still_degraded);
2446
2447                         any_working = 0;
2448                         for (j=0; j<conf->copies;j++) {
2449                                 int k;
2450                                 int d = r10_bio->devs[j].devnum;
2451                                 sector_t from_addr, to_addr;
2452                                 mdk_rdev_t *rdev;
2453                                 sector_t sector, first_bad;
2454                                 int bad_sectors;
2455                                 if (!conf->mirrors[d].rdev ||
2456                                     !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
2457                                         continue;
2458                                 /* This is where we read from */
2459                                 any_working = 1;
2460                                 rdev = conf->mirrors[d].rdev;
2461                                 sector = r10_bio->devs[j].addr;
2462
2463                                 if (is_badblock(rdev, sector, max_sync,
2464                                                 &first_bad, &bad_sectors)) {
2465                                         if (first_bad > sector)
2466                                                 max_sync = first_bad - sector;
2467                                         else {
2468                                                 bad_sectors -= (sector
2469                                                                 - first_bad);
2470                                                 if (max_sync > bad_sectors)
2471                                                         max_sync = bad_sectors;
2472                                                 continue;
2473                                         }
2474                                 }
2475                                 bio = r10_bio->devs[0].bio;
2476                                 bio->bi_next = biolist;
2477                                 biolist = bio;
2478                                 bio->bi_private = r10_bio;
2479                                 bio->bi_end_io = end_sync_read;
2480                                 bio->bi_rw = READ;
2481                                 from_addr = r10_bio->devs[j].addr;
2482                                 bio->bi_sector = from_addr +
2483                                         conf->mirrors[d].rdev->data_offset;
2484                                 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2485                                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2486                                 atomic_inc(&r10_bio->remaining);
2487                                 /* and we write to 'i' */
2488
2489                                 for (k=0; k<conf->copies; k++)
2490                                         if (r10_bio->devs[k].devnum == i)
2491                                                 break;
2492                                 BUG_ON(k == conf->copies);
2493                                 bio = r10_bio->devs[1].bio;
2494                                 bio->bi_next = biolist;
2495                                 biolist = bio;
2496                                 bio->bi_private = r10_bio;
2497                                 bio->bi_end_io = end_sync_write;
2498                                 bio->bi_rw = WRITE;
2499                                 to_addr = r10_bio->devs[k].addr;
2500                                 bio->bi_sector = to_addr +
2501                                         conf->mirrors[i].rdev->data_offset;
2502                                 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
2503
2504                                 r10_bio->devs[0].devnum = d;
2505                                 r10_bio->devs[0].addr = from_addr;
2506                                 r10_bio->devs[1].devnum = i;
2507                                 r10_bio->devs[1].addr = to_addr;
2508
2509                                 break;
2510                         }
2511                         if (j == conf->copies) {
2512                                 /* Cannot recover, so abort the recovery or
2513                                  * record a bad block */
2514                                 put_buf(r10_bio);
2515                                 if (rb2)
2516                                         atomic_dec(&rb2->remaining);
2517                                 r10_bio = rb2;
2518                                 if (any_working) {
2519                                         /* problem is that there are bad blocks
2520                                          * on other device(s)
2521                                          */
2522                                         int k;
2523                                         for (k = 0; k < conf->copies; k++)
2524                                                 if (r10_bio->devs[k].devnum == i)
2525                                                         break;
2526                                         if (!rdev_set_badblocks(
2527                                                     conf->mirrors[i].rdev,
2528                                                     r10_bio->devs[k].addr,
2529                                                     max_sync, 0))
2530                                                 any_working = 0;
2531                                 }
2532                                 if (!any_working)  {
2533                                         if (!test_and_set_bit(MD_RECOVERY_INTR,
2534                                                               &mddev->recovery))
2535                                                 printk(KERN_INFO "md/raid10:%s: insufficient "
2536                                                        "working devices for recovery.\n",
2537                                                        mdname(mddev));
2538                                         conf->mirrors[i].recovery_disabled
2539                                                 = mddev->recovery_disabled;
2540                                 }
2541                                 break;
2542                         }
2543                 }
2544                 if (biolist == NULL) {
2545                         while (r10_bio) {
2546                                 r10bio_t *rb2 = r10_bio;
2547                                 r10_bio = (r10bio_t*) rb2->master_bio;
2548                                 rb2->master_bio = NULL;
2549                                 put_buf(rb2);
2550                         }
2551                         goto giveup;
2552                 }
2553         } else {
2554                 /* resync. Schedule a read for every block at this virt offset */
2555                 int count = 0;
2556
2557                 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2558
2559                 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2560                                        &sync_blocks, mddev->degraded) &&
2561                     !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
2562                                                  &mddev->recovery)) {
2563                         /* We can skip this block */
2564                         *skipped = 1;
2565                         return sync_blocks + sectors_skipped;
2566                 }
2567                 if (sync_blocks < max_sync)
2568                         max_sync = sync_blocks;
2569                 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2570
2571                 r10_bio->mddev = mddev;
2572                 atomic_set(&r10_bio->remaining, 0);
2573                 raise_barrier(conf, 0);
2574                 conf->next_resync = sector_nr;
2575
2576                 r10_bio->master_bio = NULL;
2577                 r10_bio->sector = sector_nr;
2578                 set_bit(R10BIO_IsSync, &r10_bio->state);
2579                 raid10_find_phys(conf, r10_bio);
2580                 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
2581
2582                 for (i=0; i<conf->copies; i++) {
2583                         int d = r10_bio->devs[i].devnum;
2584                         sector_t first_bad, sector;
2585                         int bad_sectors;
2586
2587                         bio = r10_bio->devs[i].bio;
2588                         bio->bi_end_io = NULL;
2589                         clear_bit(BIO_UPTODATE, &bio->bi_flags);
2590                         if (conf->mirrors[d].rdev == NULL ||
2591                             test_bit(Faulty, &conf->mirrors[d].rdev->flags))
2592                                 continue;
2593                         sector = r10_bio->devs[i].addr;
2594                         if (is_badblock(conf->mirrors[d].rdev,
2595                                         sector, max_sync,
2596                                         &first_bad, &bad_sectors)) {
2597                                 if (first_bad > sector)
2598                                         max_sync = first_bad - sector;
2599                                 else {
2600                                         bad_sectors -= (sector - first_bad);
2601                                         if (max_sync > bad_sectors)
2602                                                 max_sync = max_sync;
2603                                         continue;
2604                                 }
2605                         }
2606                         atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2607                         atomic_inc(&r10_bio->remaining);
2608                         bio->bi_next = biolist;
2609                         biolist = bio;
2610                         bio->bi_private = r10_bio;
2611                         bio->bi_end_io = end_sync_read;
2612                         bio->bi_rw = READ;
2613                         bio->bi_sector = sector +
2614                                 conf->mirrors[d].rdev->data_offset;
2615                         bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2616                         count++;
2617                 }
2618
2619                 if (count < 2) {
2620                         for (i=0; i<conf->copies; i++) {
2621                                 int d = r10_bio->devs[i].devnum;
2622                                 if (r10_bio->devs[i].bio->bi_end_io)
2623                                         rdev_dec_pending(conf->mirrors[d].rdev,
2624                                                          mddev);
2625                         }
2626                         put_buf(r10_bio);
2627                         biolist = NULL;
2628                         goto giveup;
2629                 }
2630         }
2631
2632         for (bio = biolist; bio ; bio=bio->bi_next) {
2633
2634                 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
2635                 if (bio->bi_end_io)
2636                         bio->bi_flags |= 1 << BIO_UPTODATE;
2637                 bio->bi_vcnt = 0;
2638                 bio->bi_idx = 0;
2639                 bio->bi_phys_segments = 0;
2640                 bio->bi_size = 0;
2641         }
2642
2643         nr_sectors = 0;
2644         if (sector_nr + max_sync < max_sector)
2645                 max_sector = sector_nr + max_sync;
2646         do {
2647                 struct page *page;
2648                 int len = PAGE_SIZE;
2649                 if (sector_nr + (len>>9) > max_sector)
2650                         len = (max_sector - sector_nr) << 9;
2651                 if (len == 0)
2652                         break;
2653                 for (bio= biolist ; bio ; bio=bio->bi_next) {
2654                         struct bio *bio2;
2655                         page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2656                         if (bio_add_page(bio, page, len, 0))
2657                                 continue;
2658
2659                         /* stop here */
2660                         bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2661                         for (bio2 = biolist;
2662                              bio2 && bio2 != bio;
2663                              bio2 = bio2->bi_next) {
2664                                 /* remove last page from this bio */
2665                                 bio2->bi_vcnt--;
2666                                 bio2->bi_size -= len;
2667                                 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
2668                         }
2669                         goto bio_full;
2670                 }
2671                 nr_sectors += len>>9;
2672                 sector_nr += len>>9;
2673         } while (biolist->bi_vcnt < RESYNC_PAGES);
2674  bio_full:
2675         r10_bio->sectors = nr_sectors;
2676
2677         while (biolist) {
2678                 bio = biolist;
2679                 biolist = biolist->bi_next;
2680
2681                 bio->bi_next = NULL;
2682                 r10_bio = bio->bi_private;
2683                 r10_bio->sectors = nr_sectors;
2684
2685                 if (bio->bi_end_io == end_sync_read) {
2686                         md_sync_acct(bio->bi_bdev, nr_sectors);
2687                         generic_make_request(bio);
2688                 }
2689         }
2690
2691         if (sectors_skipped)
2692                 /* pretend they weren't skipped, it makes
2693                  * no important difference in this case
2694                  */
2695                 md_done_sync(mddev, sectors_skipped, 1);
2696
2697         return sectors_skipped + nr_sectors;
2698  giveup:
2699         /* There is nowhere to write, so all non-sync
2700          * drives must be failed or in resync, all drives
2701          * have a bad block, so try the next chunk...
2702          */
2703         if (sector_nr + max_sync < max_sector)
2704                 max_sector = sector_nr + max_sync;
2705
2706         sectors_skipped += (max_sector - sector_nr);
2707         chunks_skipped ++;
2708         sector_nr = max_sector;
2709         goto skipped;
2710 }
2711
2712 static sector_t
2713 raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2714 {
2715         sector_t size;
2716         conf_t *conf = mddev->private;
2717
2718         if (!raid_disks)
2719                 raid_disks = conf->raid_disks;
2720         if (!sectors)
2721                 sectors = conf->dev_sectors;
2722
2723         size = sectors >> conf->chunk_shift;
2724         sector_div(size, conf->far_copies);
2725         size = size * raid_disks;
2726         sector_div(size, conf->near_copies);
2727
2728         return size << conf->chunk_shift;
2729 }
2730
2731
2732 static conf_t *setup_conf(mddev_t *mddev)
2733 {
2734         conf_t *conf = NULL;
2735         int nc, fc, fo;
2736         sector_t stride, size;
2737         int err = -EINVAL;
2738
2739         if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2740             !is_power_of_2(mddev->new_chunk_sectors)) {
2741                 printk(KERN_ERR "md/raid10:%s: chunk size must be "
2742                        "at least PAGE_SIZE(%ld) and be a power of 2.\n",
2743                        mdname(mddev), PAGE_SIZE);
2744                 goto out;
2745         }
2746
2747         nc = mddev->new_layout & 255;
2748         fc = (mddev->new_layout >> 8) & 255;
2749         fo = mddev->new_layout & (1<<16);
2750
2751         if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2752             (mddev->new_layout >> 17)) {
2753                 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
2754                        mdname(mddev), mddev->new_layout);
2755                 goto out;
2756         }
2757
2758         err = -ENOMEM;
2759         conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2760         if (!conf)
2761                 goto out;
2762
2763         conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2764                                 GFP_KERNEL);
2765         if (!conf->mirrors)
2766                 goto out;
2767
2768         conf->tmppage = alloc_page(GFP_KERNEL);
2769         if (!conf->tmppage)
2770                 goto out;
2771
2772
2773         conf->raid_disks = mddev->raid_disks;
2774         conf->near_copies = nc;
2775         conf->far_copies = fc;
2776         conf->copies = nc*fc;
2777         conf->far_offset = fo;
2778         conf->chunk_mask = mddev->new_chunk_sectors - 1;
2779         conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2780
2781         conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2782                                            r10bio_pool_free, conf);
2783         if (!conf->r10bio_pool)
2784                 goto out;
2785
2786         size = mddev->dev_sectors >> conf->chunk_shift;
2787         sector_div(size, fc);
2788         size = size * conf->raid_disks;
2789         sector_div(size, nc);
2790         /* 'size' is now the number of chunks in the array */
2791         /* calculate "used chunks per device" in 'stride' */
2792         stride = size * conf->copies;
2793
2794         /* We need to round up when dividing by raid_disks to
2795          * get the stride size.
2796          */
2797         stride += conf->raid_disks - 1;
2798         sector_div(stride, conf->raid_disks);
2799
2800         conf->dev_sectors = stride << conf->chunk_shift;
2801
2802         if (fo)
2803                 stride = 1;
2804         else
2805                 sector_div(stride, fc);
2806         conf->stride = stride << conf->chunk_shift;
2807
2808
2809         spin_lock_init(&conf->device_lock);
2810         INIT_LIST_HEAD(&conf->retry_list);
2811
2812         spin_lock_init(&conf->resync_lock);
2813         init_waitqueue_head(&conf->wait_barrier);
2814
2815         conf->thread = md_register_thread(raid10d, mddev, NULL);
2816         if (!conf->thread)
2817                 goto out;
2818
2819         conf->mddev = mddev;
2820         return conf;
2821
2822  out:
2823         printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
2824                mdname(mddev));
2825         if (conf) {
2826                 if (conf->r10bio_pool)
2827                         mempool_destroy(conf->r10bio_pool);
2828                 kfree(conf->mirrors);
2829                 safe_put_page(conf->tmppage);
2830                 kfree(conf);
2831         }
2832         return ERR_PTR(err);
2833 }
2834
2835 static int run(mddev_t *mddev)
2836 {
2837         conf_t *conf;
2838         int i, disk_idx, chunk_size;
2839         mirror_info_t *disk;
2840         mdk_rdev_t *rdev;
2841         sector_t size;
2842
2843         /*
2844          * copy the already verified devices into our private RAID10
2845          * bookkeeping area. [whatever we allocate in run(),
2846          * should be freed in stop()]
2847          */
2848
2849         if (mddev->private == NULL) {
2850                 conf = setup_conf(mddev);
2851                 if (IS_ERR(conf))
2852                         return PTR_ERR(conf);
2853                 mddev->private = conf;
2854         }
2855         conf = mddev->private;
2856         if (!conf)
2857                 goto out;
2858
2859         mddev->thread = conf->thread;
2860         conf->thread = NULL;
2861
2862         chunk_size = mddev->chunk_sectors << 9;
2863         blk_queue_io_min(mddev->queue, chunk_size);
2864         if (conf->raid_disks % conf->near_copies)
2865                 blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2866         else
2867                 blk_queue_io_opt(mddev->queue, chunk_size *
2868                                  (conf->raid_disks / conf->near_copies));
2869
2870         list_for_each_entry(rdev, &mddev->disks, same_set) {
2871
2872                 disk_idx = rdev->raid_disk;
2873                 if (disk_idx >= conf->raid_disks
2874                     || disk_idx < 0)
2875                         continue;
2876                 disk = conf->mirrors + disk_idx;
2877
2878                 disk->rdev = rdev;
2879                 disk_stack_limits(mddev->gendisk, rdev->bdev,
2880                                   rdev->data_offset << 9);
2881                 /* as we don't honour merge_bvec_fn, we must never risk
2882                  * violating it, so limit max_segments to 1 lying
2883                  * within a single page.
2884                  */
2885                 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2886                         blk_queue_max_segments(mddev->queue, 1);
2887                         blk_queue_segment_boundary(mddev->queue,
2888                                                    PAGE_CACHE_SIZE - 1);
2889                 }
2890
2891                 disk->head_position = 0;
2892         }
2893         /* need to check that every block has at least one working mirror */
2894         if (!enough(conf, -1)) {
2895                 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2896                        mdname(mddev));
2897                 goto out_free_conf;
2898         }
2899
2900         mddev->degraded = 0;
2901         for (i = 0; i < conf->raid_disks; i++) {
2902
2903                 disk = conf->mirrors + i;
2904
2905                 if (!disk->rdev ||
2906                     !test_bit(In_sync, &disk->rdev->flags)) {
2907                         disk->head_position = 0;
2908                         mddev->degraded++;
2909                         if (disk->rdev)
2910                                 conf->fullsync = 1;
2911                 }
2912         }
2913
2914         if (mddev->recovery_cp != MaxSector)
2915                 printk(KERN_NOTICE "md/raid10:%s: not clean"
2916                        " -- starting background reconstruction\n",
2917                        mdname(mddev));
2918         printk(KERN_INFO
2919                 "md/raid10:%s: active with %d out of %d devices\n",
2920                 mdname(mddev), conf->raid_disks - mddev->degraded,
2921                 conf->raid_disks);
2922         /*
2923          * Ok, everything is just fine now
2924          */
2925         mddev->dev_sectors = conf->dev_sectors;
2926         size = raid10_size(mddev, 0, 0);
2927         md_set_array_sectors(mddev, size);
2928         mddev->resync_max_sectors = size;
2929
2930         mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2931         mddev->queue->backing_dev_info.congested_data = mddev;
2932
2933         /* Calculate max read-ahead size.
2934          * We need to readahead at least twice a whole stripe....
2935          * maybe...
2936          */
2937         {
2938                 int stripe = conf->raid_disks *
2939                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
2940                 stripe /= conf->near_copies;
2941                 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2942                         mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2943         }
2944
2945         if (conf->near_copies < conf->raid_disks)
2946                 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2947
2948         if (md_integrity_register(mddev))
2949                 goto out_free_conf;
2950
2951         return 0;
2952
2953 out_free_conf:
2954         md_unregister_thread(mddev->thread);
2955         if (conf->r10bio_pool)
2956                 mempool_destroy(conf->r10bio_pool);
2957         safe_put_page(conf->tmppage);
2958         kfree(conf->mirrors);
2959         kfree(conf);
2960         mddev->private = NULL;
2961 out:
2962         return -EIO;
2963 }
2964
2965 static int stop(mddev_t *mddev)
2966 {
2967         conf_t *conf = mddev->private;
2968
2969         raise_barrier(conf, 0);
2970         lower_barrier(conf);
2971
2972         md_unregister_thread(mddev->thread);
2973         mddev->thread = NULL;
2974         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2975         if (conf->r10bio_pool)
2976                 mempool_destroy(conf->r10bio_pool);
2977         kfree(conf->mirrors);
2978         kfree(conf);
2979         mddev->private = NULL;
2980         return 0;
2981 }
2982
2983 static void raid10_quiesce(mddev_t *mddev, int state)
2984 {
2985         conf_t *conf = mddev->private;
2986
2987         switch(state) {
2988         case 1:
2989                 raise_barrier(conf, 0);
2990                 break;
2991         case 0:
2992                 lower_barrier(conf);
2993                 break;
2994         }
2995 }
2996
2997 static void *raid10_takeover_raid0(mddev_t *mddev)
2998 {
2999         mdk_rdev_t *rdev;
3000         conf_t *conf;
3001
3002         if (mddev->degraded > 0) {
3003                 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3004                        mdname(mddev));
3005                 return ERR_PTR(-EINVAL);
3006         }
3007
3008         /* Set new parameters */
3009         mddev->new_level = 10;
3010         /* new layout: far_copies = 1, near_copies = 2 */
3011         mddev->new_layout = (1<<8) + 2;
3012         mddev->new_chunk_sectors = mddev->chunk_sectors;
3013         mddev->delta_disks = mddev->raid_disks;
3014         mddev->raid_disks *= 2;
3015         /* make sure it will be not marked as dirty */
3016         mddev->recovery_cp = MaxSector;
3017
3018         conf = setup_conf(mddev);
3019         if (!IS_ERR(conf)) {
3020                 list_for_each_entry(rdev, &mddev->disks, same_set)
3021                         if (rdev->raid_disk >= 0)
3022                                 rdev->new_raid_disk = rdev->raid_disk * 2;
3023                 conf->barrier = 1;
3024         }
3025
3026         return conf;
3027 }
3028
3029 static void *raid10_takeover(mddev_t *mddev)
3030 {
3031         struct raid0_private_data *raid0_priv;
3032
3033         /* raid10 can take over:
3034          *  raid0 - providing it has only two drives
3035          */
3036         if (mddev->level == 0) {
3037                 /* for raid0 takeover only one zone is supported */
3038                 raid0_priv = mddev->private;
3039                 if (raid0_priv->nr_strip_zones > 1) {
3040                         printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3041                                " with more than one zone.\n",
3042                                mdname(mddev));
3043                         return ERR_PTR(-EINVAL);
3044                 }
3045                 return raid10_takeover_raid0(mddev);
3046         }
3047         return ERR_PTR(-EINVAL);
3048 }
3049
3050 static struct mdk_personality raid10_personality =
3051 {
3052         .name           = "raid10",
3053         .level          = 10,
3054         .owner          = THIS_MODULE,
3055         .make_request   = make_request,
3056         .run            = run,
3057         .stop           = stop,
3058         .status         = status,
3059         .error_handler  = error,
3060         .hot_add_disk   = raid10_add_disk,
3061         .hot_remove_disk= raid10_remove_disk,
3062         .spare_active   = raid10_spare_active,
3063         .sync_request   = sync_request,
3064         .quiesce        = raid10_quiesce,
3065         .size           = raid10_size,
3066         .takeover       = raid10_takeover,
3067 };
3068
3069 static int __init raid_init(void)
3070 {
3071         return register_md_personality(&raid10_personality);
3072 }
3073
3074 static void raid_exit(void)
3075 {
3076         unregister_md_personality(&raid10_personality);
3077 }
3078
3079 module_init(raid_init);
3080 module_exit(raid_exit);
3081 MODULE_LICENSE("GPL");
3082 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
3083 MODULE_ALIAS("md-personality-9"); /* RAID10 */
3084 MODULE_ALIAS("md-raid10");
3085 MODULE_ALIAS("md-level-10");