md: Move check for bitmap presence to personality code.
[linux-2.6.git] / drivers / md / md.c
1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/hdreg.h>
43 #include <linux/proc_fs.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/file.h>
47 #include <linux/delay.h>
48 #include <linux/raid/md_p.h>
49 #include <linux/raid/md_u.h>
50 #include "md.h"
51 #include "bitmap.h"
52
/* Set DEBUG to a non-zero value to let dprintk() messages reach printk(). */
#define DEBUG 0
/*
 * Debug print: the && short-circuits when DEBUG is 0, so printk() is
 * not called in that case.
 */
#define dprintk(x...) ((void)(DEBUG && printk(x)))
55
56
57 #ifndef MODULE
58 static void autostart_arrays(int part);
59 #endif
60
61 static LIST_HEAD(pers_list);
62 static DEFINE_SPINLOCK(pers_lock);
63
64 static void md_print_devices(void);
65
66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
67
/*
 * Report an internal inconsistency: print the file and line of the
 * call site, then dump the device state via md_print_devices().
 * Any arguments are accepted but ignored.
 *
 * The body is wrapped in do { } while (0) so that "MD_BUG();" expands
 * to exactly one statement and can be used safely as the body of an
 * un-braced if/else.
 */
#define MD_BUG(x...)							\
	do {								\
		printk("md: bug in file %s, line %d\n",			\
		       __FILE__, __LINE__);				\
		md_print_devices();					\
	} while (0)
69
70 /*
71  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72  * is 1000 KB/sec, so the extra system load does not show up that much.
73  * Increase it if you want to have more _guaranteed_ speed. Note that
74  * the RAID driver will use the maximum available bandwidth if the IO
75  * subsystem is idle. There is also an 'absolute maximum' reconstruction
76  * speed limit - in case reconstruction slows down your system despite
77  * idle IO detection.
78  *
79  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
80  * or /sys/block/mdX/md/sync_speed_{min,max}
81  */
82
83 static int sysctl_speed_limit_min = 1000;
84 static int sysctl_speed_limit_max = 200000;
85 static inline int speed_min(mddev_t *mddev)
86 {
87         return mddev->sync_speed_min ?
88                 mddev->sync_speed_min : sysctl_speed_limit_min;
89 }
90
91 static inline int speed_max(mddev_t *mddev)
92 {
93         return mddev->sync_speed_max ?
94                 mddev->sync_speed_max : sysctl_speed_limit_max;
95 }
96
97 static struct ctl_table_header *raid_table_header;
98
99 static ctl_table raid_table[] = {
100         {
101                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
102                 .procname       = "speed_limit_min",
103                 .data           = &sysctl_speed_limit_min,
104                 .maxlen         = sizeof(int),
105                 .mode           = S_IRUGO|S_IWUSR,
106                 .proc_handler   = &proc_dointvec,
107         },
108         {
109                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
110                 .procname       = "speed_limit_max",
111                 .data           = &sysctl_speed_limit_max,
112                 .maxlen         = sizeof(int),
113                 .mode           = S_IRUGO|S_IWUSR,
114                 .proc_handler   = &proc_dointvec,
115         },
116         { .ctl_name = 0 }
117 };
118
119 static ctl_table raid_dir_table[] = {
120         {
121                 .ctl_name       = DEV_RAID,
122                 .procname       = "raid",
123                 .maxlen         = 0,
124                 .mode           = S_IRUGO|S_IXUGO,
125                 .child          = raid_table,
126         },
127         { .ctl_name = 0 }
128 };
129
130 static ctl_table raid_root_table[] = {
131         {
132                 .ctl_name       = CTL_DEV,
133                 .procname       = "dev",
134                 .maxlen         = 0,
135                 .mode           = 0555,
136                 .child          = raid_dir_table,
137         },
138         { .ctl_name = 0 }
139 };
140
141 static struct block_device_operations md_fops;
142
143 static int start_readonly;
144
145 /*
146  * We have a system wide 'event count' that is incremented
147  * on any 'interesting' event, and readers of /proc/mdstat
148  * can use 'poll' or 'select' to find out when the event
149  * count increases.
150  *
151  * Events are:
152  *  start array, stop array, error, add device, remove device,
153  *  start build, activate spare
154  */
155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
156 static atomic_t md_event_count;
157 void md_new_event(mddev_t *mddev)
158 {
159         atomic_inc(&md_event_count);
160         wake_up(&md_event_waiters);
161 }
162 EXPORT_SYMBOL_GPL(md_new_event);
163
164 /* Alternate version that can be called from interrupts
165  * when calling sysfs_notify isn't needed.
166  */
167 static void md_new_event_inintr(mddev_t *mddev)
168 {
169         atomic_inc(&md_event_count);
170         wake_up(&md_event_waiters);
171 }
172
/*
 * List of all existing md arrays, kept so that they can be iterated over.
 * all_mddevs_lock protects this list.
 */
177 static LIST_HEAD(all_mddevs);
178 static DEFINE_SPINLOCK(all_mddevs_lock);
179
180
/*
 * Iterate through all used mddevs in the system.
 *
 * all_mddevs_lock is held whenever the list is being navigated, and a
 * reference is always held on the current mddev while the lock is
 * dropped (i.e. while the loop body runs).  The reference on the
 * previous mddev is released when the loop advances.
 *
 * Any code which breaks out of this loop still owns a reference to
 * the current mddev and must mddev_put() it.
 *
 * 'mddev' is the iteration variable (mddev_t *), 'tmp' is a scratch
 * struct list_head * cursor.
 */
#define for_each_mddev(mddev,tmp)					\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)
202
203
204 /* Rather than calling directly into the personality make_request function,
205  * IO requests come here first so that we can check if the device is
206  * being suspended pending a reconfiguration.
207  * We hold a refcount over the call to ->make_request.  By the time that
208  * call has finished, the bio has been linked into some internal structure
209  * and so is visible to ->quiesce(), so we don't need the refcount any more.
210  */
211 static int md_make_request(struct request_queue *q, struct bio *bio)
212 {
213         mddev_t *mddev = q->queuedata;
214         int rv;
215         if (mddev == NULL || mddev->pers == NULL) {
216                 bio_io_error(bio);
217                 return 0;
218         }
219         rcu_read_lock();
220         if (mddev->suspended) {
221                 DEFINE_WAIT(__wait);
222                 for (;;) {
223                         prepare_to_wait(&mddev->sb_wait, &__wait,
224                                         TASK_UNINTERRUPTIBLE);
225                         if (!mddev->suspended)
226                                 break;
227                         rcu_read_unlock();
228                         schedule();
229                         rcu_read_lock();
230                 }
231                 finish_wait(&mddev->sb_wait, &__wait);
232         }
233         atomic_inc(&mddev->active_io);
234         rcu_read_unlock();
235         rv = mddev->pers->make_request(q, bio);
236         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
237                 wake_up(&mddev->sb_wait);
238
239         return rv;
240 }
241
242 static void mddev_suspend(mddev_t *mddev)
243 {
244         BUG_ON(mddev->suspended);
245         mddev->suspended = 1;
246         synchronize_rcu();
247         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
248         mddev->pers->quiesce(mddev, 1);
249         md_unregister_thread(mddev->thread);
250         mddev->thread = NULL;
251         /* we now know that no code is executing in the personality module,
252          * except possibly the tail end of a ->bi_end_io function, but that
253          * is certain to complete before the module has a chance to get
254          * unloaded
255          */
256 }
257
258 static void mddev_resume(mddev_t *mddev)
259 {
260         mddev->suspended = 0;
261         wake_up(&mddev->sb_wait);
262         mddev->pers->quiesce(mddev, 0);
263 }
264
265
266 static inline mddev_t *mddev_get(mddev_t *mddev)
267 {
268         atomic_inc(&mddev->active);
269         return mddev;
270 }
271
272 static void mddev_delayed_delete(struct work_struct *ws);
273
274 static void mddev_put(mddev_t *mddev)
275 {
276         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
277                 return;
278         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
279             !mddev->hold_active) {
280                 list_del(&mddev->all_mddevs);
281                 if (mddev->gendisk) {
282                         /* we did a probe so need to clean up.
283                          * Call schedule_work inside the spinlock
284                          * so that flush_scheduled_work() after
285                          * mddev_find will succeed in waiting for the
286                          * work to be done.
287                          */
288                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
289                         schedule_work(&mddev->del_work);
290                 } else
291                         kfree(mddev);
292         }
293         spin_unlock(&all_mddevs_lock);
294 }
295
296 static mddev_t * mddev_find(dev_t unit)
297 {
298         mddev_t *mddev, *new = NULL;
299
300  retry:
301         spin_lock(&all_mddevs_lock);
302
303         if (unit) {
304                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
305                         if (mddev->unit == unit) {
306                                 mddev_get(mddev);
307                                 spin_unlock(&all_mddevs_lock);
308                                 kfree(new);
309                                 return mddev;
310                         }
311
312                 if (new) {
313                         list_add(&new->all_mddevs, &all_mddevs);
314                         spin_unlock(&all_mddevs_lock);
315                         new->hold_active = UNTIL_IOCTL;
316                         return new;
317                 }
318         } else if (new) {
319                 /* find an unused unit number */
320                 static int next_minor = 512;
321                 int start = next_minor;
322                 int is_free = 0;
323                 int dev = 0;
324                 while (!is_free) {
325                         dev = MKDEV(MD_MAJOR, next_minor);
326                         next_minor++;
327                         if (next_minor > MINORMASK)
328                                 next_minor = 0;
329                         if (next_minor == start) {
330                                 /* Oh dear, all in use. */
331                                 spin_unlock(&all_mddevs_lock);
332                                 kfree(new);
333                                 return NULL;
334                         }
335                                 
336                         is_free = 1;
337                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
338                                 if (mddev->unit == dev) {
339                                         is_free = 0;
340                                         break;
341                                 }
342                 }
343                 new->unit = dev;
344                 new->md_minor = MINOR(dev);
345                 new->hold_active = UNTIL_STOP;
346                 list_add(&new->all_mddevs, &all_mddevs);
347                 spin_unlock(&all_mddevs_lock);
348                 return new;
349         }
350         spin_unlock(&all_mddevs_lock);
351
352         new = kzalloc(sizeof(*new), GFP_KERNEL);
353         if (!new)
354                 return NULL;
355
356         new->unit = unit;
357         if (MAJOR(unit) == MD_MAJOR)
358                 new->md_minor = MINOR(unit);
359         else
360                 new->md_minor = MINOR(unit) >> MdpMinorShift;
361
362         mutex_init(&new->reconfig_mutex);
363         INIT_LIST_HEAD(&new->disks);
364         INIT_LIST_HEAD(&new->all_mddevs);
365         init_timer(&new->safemode_timer);
366         atomic_set(&new->active, 1);
367         atomic_set(&new->openers, 0);
368         atomic_set(&new->active_io, 0);
369         spin_lock_init(&new->write_lock);
370         init_waitqueue_head(&new->sb_wait);
371         init_waitqueue_head(&new->recovery_wait);
372         new->reshape_position = MaxSector;
373         new->resync_min = 0;
374         new->resync_max = MaxSector;
375         new->level = LEVEL_NONE;
376
377         goto retry;
378 }
379
380 static inline int mddev_lock(mddev_t * mddev)
381 {
382         return mutex_lock_interruptible(&mddev->reconfig_mutex);
383 }
384
385 static inline int mddev_is_locked(mddev_t *mddev)
386 {
387         return mutex_is_locked(&mddev->reconfig_mutex);
388 }
389
390 static inline int mddev_trylock(mddev_t * mddev)
391 {
392         return mutex_trylock(&mddev->reconfig_mutex);
393 }
394
395 static inline void mddev_unlock(mddev_t * mddev)
396 {
397         mutex_unlock(&mddev->reconfig_mutex);
398
399         md_wakeup_thread(mddev->thread);
400 }
401
402 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
403 {
404         mdk_rdev_t *rdev;
405
406         list_for_each_entry(rdev, &mddev->disks, same_set)
407                 if (rdev->desc_nr == nr)
408                         return rdev;
409
410         return NULL;
411 }
412
413 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
414 {
415         mdk_rdev_t *rdev;
416
417         list_for_each_entry(rdev, &mddev->disks, same_set)
418                 if (rdev->bdev->bd_dev == dev)
419                         return rdev;
420
421         return NULL;
422 }
423
424 static struct mdk_personality *find_pers(int level, char *clevel)
425 {
426         struct mdk_personality *pers;
427         list_for_each_entry(pers, &pers_list, list) {
428                 if (level != LEVEL_NONE && pers->level == level)
429                         return pers;
430                 if (strcmp(pers->name, clevel)==0)
431                         return pers;
432         }
433         return NULL;
434 }
435
436 /* return the offset of the super block in 512byte sectors */
437 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
438 {
439         sector_t num_sectors = bdev->bd_inode->i_size / 512;
440         return MD_NEW_SIZE_SECTORS(num_sectors);
441 }
442
443 static int alloc_disk_sb(mdk_rdev_t * rdev)
444 {
445         if (rdev->sb_page)
446                 MD_BUG();
447
448         rdev->sb_page = alloc_page(GFP_KERNEL);
449         if (!rdev->sb_page) {
450                 printk(KERN_ALERT "md: out of memory.\n");
451                 return -ENOMEM;
452         }
453
454         return 0;
455 }
456
457 static void free_disk_sb(mdk_rdev_t * rdev)
458 {
459         if (rdev->sb_page) {
460                 put_page(rdev->sb_page);
461                 rdev->sb_loaded = 0;
462                 rdev->sb_page = NULL;
463                 rdev->sb_start = 0;
464                 rdev->sectors = 0;
465         }
466 }
467
468
469 static void super_written(struct bio *bio, int error)
470 {
471         mdk_rdev_t *rdev = bio->bi_private;
472         mddev_t *mddev = rdev->mddev;
473
474         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
475                 printk("md: super_written gets error=%d, uptodate=%d\n",
476                        error, test_bit(BIO_UPTODATE, &bio->bi_flags));
477                 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
478                 md_error(mddev, rdev);
479         }
480
481         if (atomic_dec_and_test(&mddev->pending_writes))
482                 wake_up(&mddev->sb_wait);
483         bio_put(bio);
484 }
485
486 static void super_written_barrier(struct bio *bio, int error)
487 {
488         struct bio *bio2 = bio->bi_private;
489         mdk_rdev_t *rdev = bio2->bi_private;
490         mddev_t *mddev = rdev->mddev;
491
492         if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
493             error == -EOPNOTSUPP) {
494                 unsigned long flags;
495                 /* barriers don't appear to be supported :-( */
496                 set_bit(BarriersNotsupp, &rdev->flags);
497                 mddev->barriers_work = 0;
498                 spin_lock_irqsave(&mddev->write_lock, flags);
499                 bio2->bi_next = mddev->biolist;
500                 mddev->biolist = bio2;
501                 spin_unlock_irqrestore(&mddev->write_lock, flags);
502                 wake_up(&mddev->sb_wait);
503                 bio_put(bio);
504         } else {
505                 bio_put(bio2);
506                 bio->bi_private = rdev;
507                 super_written(bio, error);
508         }
509 }
510
511 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
512                    sector_t sector, int size, struct page *page)
513 {
514         /* write first size bytes of page to sector of rdev
515          * Increment mddev->pending_writes before returning
516          * and decrement it on completion, waking up sb_wait
517          * if zero is reached.
518          * If an error occurred, call md_error
519          *
520          * As we might need to resubmit the request if BIO_RW_BARRIER
521          * causes ENOTSUPP, we allocate a spare bio...
522          */
523         struct bio *bio = bio_alloc(GFP_NOIO, 1);
524         int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
525
526         bio->bi_bdev = rdev->bdev;
527         bio->bi_sector = sector;
528         bio_add_page(bio, page, size, 0);
529         bio->bi_private = rdev;
530         bio->bi_end_io = super_written;
531         bio->bi_rw = rw;
532
533         atomic_inc(&mddev->pending_writes);
534         if (!test_bit(BarriersNotsupp, &rdev->flags)) {
535                 struct bio *rbio;
536                 rw |= (1<<BIO_RW_BARRIER);
537                 rbio = bio_clone(bio, GFP_NOIO);
538                 rbio->bi_private = bio;
539                 rbio->bi_end_io = super_written_barrier;
540                 submit_bio(rw, rbio);
541         } else
542                 submit_bio(rw, bio);
543 }
544
545 void md_super_wait(mddev_t *mddev)
546 {
547         /* wait for all superblock writes that were scheduled to complete.
548          * if any had to be retried (due to BARRIER problems), retry them
549          */
550         DEFINE_WAIT(wq);
551         for(;;) {
552                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
553                 if (atomic_read(&mddev->pending_writes)==0)
554                         break;
555                 while (mddev->biolist) {
556                         struct bio *bio;
557                         spin_lock_irq(&mddev->write_lock);
558                         bio = mddev->biolist;
559                         mddev->biolist = bio->bi_next ;
560                         bio->bi_next = NULL;
561                         spin_unlock_irq(&mddev->write_lock);
562                         submit_bio(bio->bi_rw, bio);
563                 }
564                 schedule();
565         }
566         finish_wait(&mddev->sb_wait, &wq);
567 }
568
569 static void bi_complete(struct bio *bio, int error)
570 {
571         complete((struct completion*)bio->bi_private);
572 }
573
574 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
575                    struct page *page, int rw)
576 {
577         struct bio *bio = bio_alloc(GFP_NOIO, 1);
578         struct completion event;
579         int ret;
580
581         rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
582
583         bio->bi_bdev = bdev;
584         bio->bi_sector = sector;
585         bio_add_page(bio, page, size, 0);
586         init_completion(&event);
587         bio->bi_private = &event;
588         bio->bi_end_io = bi_complete;
589         submit_bio(rw, bio);
590         wait_for_completion(&event);
591
592         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
593         bio_put(bio);
594         return ret;
595 }
596 EXPORT_SYMBOL_GPL(sync_page_io);
597
598 static int read_disk_sb(mdk_rdev_t * rdev, int size)
599 {
600         char b[BDEVNAME_SIZE];
601         if (!rdev->sb_page) {
602                 MD_BUG();
603                 return -EINVAL;
604         }
605         if (rdev->sb_loaded)
606                 return 0;
607
608
609         if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
610                 goto fail;
611         rdev->sb_loaded = 1;
612         return 0;
613
614 fail:
615         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
616                 bdevname(rdev->bdev,b));
617         return -EINVAL;
618 }
619
620 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
621 {
622         return  sb1->set_uuid0 == sb2->set_uuid0 &&
623                 sb1->set_uuid1 == sb2->set_uuid1 &&
624                 sb1->set_uuid2 == sb2->set_uuid2 &&
625                 sb1->set_uuid3 == sb2->set_uuid3;
626 }
627
628 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
629 {
630         int ret;
631         mdp_super_t *tmp1, *tmp2;
632
633         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
634         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
635
636         if (!tmp1 || !tmp2) {
637                 ret = 0;
638                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
639                 goto abort;
640         }
641
642         *tmp1 = *sb1;
643         *tmp2 = *sb2;
644
645         /*
646          * nr_disks is not constant
647          */
648         tmp1->nr_disks = 0;
649         tmp2->nr_disks = 0;
650
651         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
652 abort:
653         kfree(tmp1);
654         kfree(tmp2);
655         return ret;
656 }
657
658
/*
 * Fold a 32-bit checksum: add the upper 16 bits to the lower 16 bits,
 * then repeat once so that a carry out of the first addition is
 * folded back in as well.
 */
static u32 md_csum_fold(u32 csum)
{
	u32 folded = (csum >> 16) + (csum & 0xffff);

	return (folded >> 16) + (folded & 0xffff);
}
664
665 static unsigned int calc_sb_csum(mdp_super_t * sb)
666 {
667         u64 newcsum = 0;
668         u32 *sb32 = (u32*)sb;
669         int i;
670         unsigned int disk_csum, csum;
671
672         disk_csum = sb->sb_csum;
673         sb->sb_csum = 0;
674
675         for (i = 0; i < MD_SB_BYTES/4 ; i++)
676                 newcsum += sb32[i];
677         csum = (newcsum & 0xffffffff) + (newcsum>>32);
678
679
680 #ifdef CONFIG_ALPHA
681         /* This used to use csum_partial, which was wrong for several
682          * reasons including that different results are returned on
683          * different architectures.  It isn't critical that we get exactly
684          * the same return value as before (we always csum_fold before
685          * testing, and that removes any differences).  However as we
686          * know that csum_partial always returned a 16bit value on
687          * alphas, do a fold to maximise conformity to previous behaviour.
688          */
689         sb->sb_csum = md_csum_fold(disk_csum);
690 #else
691         sb->sb_csum = disk_csum;
692 #endif
693         return csum;
694 }
695
696
697 /*
698  * Handle superblock details.
699  * We want to be able to handle multiple superblock formats
700  * so we have a common interface to them all, and an array of
701  * different handlers.
702  * We rely on user-space to write the initial superblock, and support
703  * reading and updating of superblocks.
704  * Interface methods are:
705  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
706  *      loads and validates a superblock on dev.
707  *      if refdev != NULL, compare superblocks on both devices
708  *    Return:
709  *      0 - dev has a superblock that is compatible with refdev
710  *      1 - dev has a superblock that is compatible and newer than refdev
711  *          so dev should be used as the refdev in future
712  *     -EINVAL superblock incompatible or invalid
713  *     -othererror e.g. -EIO
714  *
715  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
716  *      Verify that dev is acceptable into mddev.
717  *       The first time, mddev->raid_disks will be 0, and data from
718  *       dev should be merged in.  Subsequent calls check that dev
719  *       is new enough.  Return 0 or -EINVAL
720  *
721  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
722  *     Update the superblock for rdev with data in mddev
723  *     This does not write to disc.
724  *
725  */
726
727 struct super_type  {
728         char                *name;
729         struct module       *owner;
730         int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
731                                           int minor_version);
732         int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
733         void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
734         unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
735                                                 sector_t num_sectors);
736 };
737
738 /*
739  * Check that the given mddev has no bitmap.
740  *
741  * This function is called from the run method of all personalities that do not
742  * support bitmaps. It prints an error message and returns non-zero if mddev
743  * has a bitmap. Otherwise, it returns 0.
744  *
745  */
746 int md_check_no_bitmap(mddev_t *mddev)
747 {
748         if (!mddev->bitmap_file && !mddev->bitmap_offset)
749                 return 0;
750         printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
751                 mdname(mddev), mddev->pers->name);
752         return 1;
753 }
754 EXPORT_SYMBOL(md_check_no_bitmap);
755
756 /*
757  * load_super for 0.90.0 
758  */
759 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
760 {
761         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
762         mdp_super_t *sb;
763         int ret;
764
765         /*
766          * Calculate the position of the superblock (512byte sectors),
767          * it's at the end of the disk.
768          *
769          * It also happens to be a multiple of 4Kb.
770          */
771         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
772
773         ret = read_disk_sb(rdev, MD_SB_BYTES);
774         if (ret) return ret;
775
776         ret = -EINVAL;
777
778         bdevname(rdev->bdev, b);
779         sb = (mdp_super_t*)page_address(rdev->sb_page);
780
781         if (sb->md_magic != MD_SB_MAGIC) {
782                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
783                        b);
784                 goto abort;
785         }
786
787         if (sb->major_version != 0 ||
788             sb->minor_version < 90 ||
789             sb->minor_version > 91) {
790                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
791                         sb->major_version, sb->minor_version,
792                         b);
793                 goto abort;
794         }
795
796         if (sb->raid_disks <= 0)
797                 goto abort;
798
799         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
800                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
801                         b);
802                 goto abort;
803         }
804
805         rdev->preferred_minor = sb->md_minor;
806         rdev->data_offset = 0;
807         rdev->sb_size = MD_SB_BYTES;
808
809         if (sb->level == LEVEL_MULTIPATH)
810                 rdev->desc_nr = -1;
811         else
812                 rdev->desc_nr = sb->this_disk.number;
813
814         if (!refdev) {
815                 ret = 1;
816         } else {
817                 __u64 ev1, ev2;
818                 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
819                 if (!uuid_equal(refsb, sb)) {
820                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
821                                 b, bdevname(refdev->bdev,b2));
822                         goto abort;
823                 }
824                 if (!sb_equal(refsb, sb)) {
825                         printk(KERN_WARNING "md: %s has same UUID"
826                                " but different superblock to %s\n",
827                                b, bdevname(refdev->bdev, b2));
828                         goto abort;
829                 }
830                 ev1 = md_event(sb);
831                 ev2 = md_event(refsb);
832                 if (ev1 > ev2)
833                         ret = 1;
834                 else 
835                         ret = 0;
836         }
837         rdev->sectors = rdev->sb_start;
838
839         if (rdev->sectors < sb->size * 2 && sb->level > 1)
840                 /* "this cannot possibly happen" ... */
841                 ret = -EINVAL;
842
843  abort:
844         return ret;
845 }
846
847 /*
848  * validate_super for 0.90.0
849  */
850 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
851 {
852         mdp_disk_t *desc;
853         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
854         __u64 ev1 = md_event(sb);
855
856         rdev->raid_disk = -1;
857         clear_bit(Faulty, &rdev->flags);
858         clear_bit(In_sync, &rdev->flags);
859         clear_bit(WriteMostly, &rdev->flags);
860         clear_bit(BarriersNotsupp, &rdev->flags);
861
862         if (mddev->raid_disks == 0) {
863                 mddev->major_version = 0;
864                 mddev->minor_version = sb->minor_version;
865                 mddev->patch_version = sb->patch_version;
866                 mddev->external = 0;
867                 mddev->chunk_sectors = sb->chunk_size >> 9;
868                 mddev->ctime = sb->ctime;
869                 mddev->utime = sb->utime;
870                 mddev->level = sb->level;
871                 mddev->clevel[0] = 0;
872                 mddev->layout = sb->layout;
873                 mddev->raid_disks = sb->raid_disks;
874                 mddev->dev_sectors = sb->size * 2;
875                 mddev->events = ev1;
876                 mddev->bitmap_offset = 0;
877                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
878
879                 if (mddev->minor_version >= 91) {
880                         mddev->reshape_position = sb->reshape_position;
881                         mddev->delta_disks = sb->delta_disks;
882                         mddev->new_level = sb->new_level;
883                         mddev->new_layout = sb->new_layout;
884                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
885                 } else {
886                         mddev->reshape_position = MaxSector;
887                         mddev->delta_disks = 0;
888                         mddev->new_level = mddev->level;
889                         mddev->new_layout = mddev->layout;
890                         mddev->new_chunk_sectors = mddev->chunk_sectors;
891                 }
892
893                 if (sb->state & (1<<MD_SB_CLEAN))
894                         mddev->recovery_cp = MaxSector;
895                 else {
896                         if (sb->events_hi == sb->cp_events_hi && 
897                                 sb->events_lo == sb->cp_events_lo) {
898                                 mddev->recovery_cp = sb->recovery_cp;
899                         } else
900                                 mddev->recovery_cp = 0;
901                 }
902
903                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
904                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
905                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
906                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
907
908                 mddev->max_disks = MD_SB_DISKS;
909
910                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
911                     mddev->bitmap_file == NULL)
912                         mddev->bitmap_offset = mddev->default_bitmap_offset;
913
914         } else if (mddev->pers == NULL) {
915                 /* Insist on good event counter while assembling */
916                 ++ev1;
917                 if (ev1 < mddev->events) 
918                         return -EINVAL;
919         } else if (mddev->bitmap) {
920                 /* if adding to array with a bitmap, then we can accept an
921                  * older device ... but not too old.
922                  */
923                 if (ev1 < mddev->bitmap->events_cleared)
924                         return 0;
925         } else {
926                 if (ev1 < mddev->events)
927                         /* just a hot-add of a new device, leave raid_disk at -1 */
928                         return 0;
929         }
930
931         if (mddev->level != LEVEL_MULTIPATH) {
932                 desc = sb->disks + rdev->desc_nr;
933
934                 if (desc->state & (1<<MD_DISK_FAULTY))
935                         set_bit(Faulty, &rdev->flags);
936                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
937                             desc->raid_disk < mddev->raid_disks */) {
938                         set_bit(In_sync, &rdev->flags);
939                         rdev->raid_disk = desc->raid_disk;
940                 }
941                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
942                         set_bit(WriteMostly, &rdev->flags);
943         } else /* MULTIPATH are always insync */
944                 set_bit(In_sync, &rdev->flags);
945         return 0;
946 }
947
948 /*
949  * sync_super for 0.90.0
950  */
951 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
952 {
953         mdp_super_t *sb;
954         mdk_rdev_t *rdev2;
955         int next_spare = mddev->raid_disks;
956
957
958         /* make rdev->sb match mddev data..
959          *
960          * 1/ zero out disks
961          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
962          * 3/ any empty disks < next_spare become removed
963          *
964          * disks[0] gets initialised to REMOVED because
965          * we cannot be sure from other fields if it has
966          * been initialised or not.
967          */
968         int i;
969         int active=0, working=0,failed=0,spare=0,nr_disks=0;
970
971         rdev->sb_size = MD_SB_BYTES;
972
973         sb = (mdp_super_t*)page_address(rdev->sb_page);
974
975         memset(sb, 0, sizeof(*sb));
976
977         sb->md_magic = MD_SB_MAGIC;
978         sb->major_version = mddev->major_version;
979         sb->patch_version = mddev->patch_version;
980         sb->gvalid_words  = 0; /* ignored */
981         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
982         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
983         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
984         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
985
986         sb->ctime = mddev->ctime;
987         sb->level = mddev->level;
988         sb->size = mddev->dev_sectors / 2;
989         sb->raid_disks = mddev->raid_disks;
990         sb->md_minor = mddev->md_minor;
991         sb->not_persistent = 0;
992         sb->utime = mddev->utime;
993         sb->state = 0;
994         sb->events_hi = (mddev->events>>32);
995         sb->events_lo = (u32)mddev->events;
996
997         if (mddev->reshape_position == MaxSector)
998                 sb->minor_version = 90;
999         else {
1000                 sb->minor_version = 91;
1001                 sb->reshape_position = mddev->reshape_position;
1002                 sb->new_level = mddev->new_level;
1003                 sb->delta_disks = mddev->delta_disks;
1004                 sb->new_layout = mddev->new_layout;
1005                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1006         }
1007         mddev->minor_version = sb->minor_version;
1008         if (mddev->in_sync)
1009         {
1010                 sb->recovery_cp = mddev->recovery_cp;
1011                 sb->cp_events_hi = (mddev->events>>32);
1012                 sb->cp_events_lo = (u32)mddev->events;
1013                 if (mddev->recovery_cp == MaxSector)
1014                         sb->state = (1<< MD_SB_CLEAN);
1015         } else
1016                 sb->recovery_cp = 0;
1017
1018         sb->layout = mddev->layout;
1019         sb->chunk_size = mddev->chunk_sectors << 9;
1020
1021         if (mddev->bitmap && mddev->bitmap_file == NULL)
1022                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1023
1024         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1025         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1026                 mdp_disk_t *d;
1027                 int desc_nr;
1028                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1029                     && !test_bit(Faulty, &rdev2->flags))
1030                         desc_nr = rdev2->raid_disk;
1031                 else
1032                         desc_nr = next_spare++;
1033                 rdev2->desc_nr = desc_nr;
1034                 d = &sb->disks[rdev2->desc_nr];
1035                 nr_disks++;
1036                 d->number = rdev2->desc_nr;
1037                 d->major = MAJOR(rdev2->bdev->bd_dev);
1038                 d->minor = MINOR(rdev2->bdev->bd_dev);
1039                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1040                     && !test_bit(Faulty, &rdev2->flags))
1041                         d->raid_disk = rdev2->raid_disk;
1042                 else
1043                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1044                 if (test_bit(Faulty, &rdev2->flags))
1045                         d->state = (1<<MD_DISK_FAULTY);
1046                 else if (test_bit(In_sync, &rdev2->flags)) {
1047                         d->state = (1<<MD_DISK_ACTIVE);
1048                         d->state |= (1<<MD_DISK_SYNC);
1049                         active++;
1050                         working++;
1051                 } else {
1052                         d->state = 0;
1053                         spare++;
1054                         working++;
1055                 }
1056                 if (test_bit(WriteMostly, &rdev2->flags))
1057                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1058         }
1059         /* now set the "removed" and "faulty" bits on any missing devices */
1060         for (i=0 ; i < mddev->raid_disks ; i++) {
1061                 mdp_disk_t *d = &sb->disks[i];
1062                 if (d->state == 0 && d->number == 0) {
1063                         d->number = i;
1064                         d->raid_disk = i;
1065                         d->state = (1<<MD_DISK_REMOVED);
1066                         d->state |= (1<<MD_DISK_FAULTY);
1067                         failed++;
1068                 }
1069         }
1070         sb->nr_disks = nr_disks;
1071         sb->active_disks = active;
1072         sb->working_disks = working;
1073         sb->failed_disks = failed;
1074         sb->spare_disks = spare;
1075
1076         sb->this_disk = sb->disks[rdev->desc_nr];
1077         sb->sb_csum = calc_sb_csum(sb);
1078 }
1079
1080 /*
1081  * rdev_size_change for 0.90.0
1082  */
1083 static unsigned long long
1084 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1085 {
1086         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1087                 return 0; /* component must fit device */
1088         if (rdev->mddev->bitmap_offset)
1089                 return 0; /* can't move bitmap */
1090         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1091         if (!num_sectors || num_sectors > rdev->sb_start)
1092                 num_sectors = rdev->sb_start;
1093         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1094                        rdev->sb_page);
1095         md_super_wait(rdev->mddev);
1096         return num_sectors / 2; /* kB for sysfs */
1097 }
1098
1099
1100 /*
1101  * version 1 superblock
1102  */
1103
1104 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1105 {
1106         __le32 disk_csum;
1107         u32 csum;
1108         unsigned long long newcsum;
1109         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1110         __le32 *isuper = (__le32*)sb;
1111         int i;
1112
1113         disk_csum = sb->sb_csum;
1114         sb->sb_csum = 0;
1115         newcsum = 0;
1116         for (i=0; size>=4; size -= 4 )
1117                 newcsum += le32_to_cpu(*isuper++);
1118
1119         if (size == 2)
1120                 newcsum += le16_to_cpu(*(__le16*) isuper);
1121
1122         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1123         sb->sb_csum = disk_csum;
1124         return cpu_to_le32(csum);
1125 }
1126
1127 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1128 {
1129         struct mdp_superblock_1 *sb;
1130         int ret;
1131         sector_t sb_start;
1132         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1133         int bmask;
1134
1135         /*
1136          * Calculate the position of the superblock in 512byte sectors.
1137          * It is always aligned to a 4K boundary and
1138          * depeding on minor_version, it can be:
1139          * 0: At least 8K, but less than 12K, from end of device
1140          * 1: At start of device
1141          * 2: 4K from start of device.
1142          */
1143         switch(minor_version) {
1144         case 0:
1145                 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1146                 sb_start -= 8*2;
1147                 sb_start &= ~(sector_t)(4*2-1);
1148                 break;
1149         case 1:
1150                 sb_start = 0;
1151                 break;
1152         case 2:
1153                 sb_start = 8;
1154                 break;
1155         default:
1156                 return -EINVAL;
1157         }
1158         rdev->sb_start = sb_start;
1159
1160         /* superblock is rarely larger than 1K, but it can be larger,
1161          * and it is safe to read 4k, so we do that
1162          */
1163         ret = read_disk_sb(rdev, 4096);
1164         if (ret) return ret;
1165
1166
1167         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1168
1169         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1170             sb->major_version != cpu_to_le32(1) ||
1171             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1172             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1173             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1174                 return -EINVAL;
1175
1176         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1177                 printk("md: invalid superblock checksum on %s\n",
1178                         bdevname(rdev->bdev,b));
1179                 return -EINVAL;
1180         }
1181         if (le64_to_cpu(sb->data_size) < 10) {
1182                 printk("md: data_size too small on %s\n",
1183                        bdevname(rdev->bdev,b));
1184                 return -EINVAL;
1185         }
1186
1187         rdev->preferred_minor = 0xffff;
1188         rdev->data_offset = le64_to_cpu(sb->data_offset);
1189         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1190
1191         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1192         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1193         if (rdev->sb_size & bmask)
1194                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1195
1196         if (minor_version
1197             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1198                 return -EINVAL;
1199
1200         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1201                 rdev->desc_nr = -1;
1202         else
1203                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1204
1205         if (!refdev) {
1206                 ret = 1;
1207         } else {
1208                 __u64 ev1, ev2;
1209                 struct mdp_superblock_1 *refsb = 
1210                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1211
1212                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1213                     sb->level != refsb->level ||
1214                     sb->layout != refsb->layout ||
1215                     sb->chunksize != refsb->chunksize) {
1216                         printk(KERN_WARNING "md: %s has strangely different"
1217                                 " superblock to %s\n",
1218                                 bdevname(rdev->bdev,b),
1219                                 bdevname(refdev->bdev,b2));
1220                         return -EINVAL;
1221                 }
1222                 ev1 = le64_to_cpu(sb->events);
1223                 ev2 = le64_to_cpu(refsb->events);
1224
1225                 if (ev1 > ev2)
1226                         ret = 1;
1227                 else
1228                         ret = 0;
1229         }
1230         if (minor_version)
1231                 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1232                         le64_to_cpu(sb->data_offset);
1233         else
1234                 rdev->sectors = rdev->sb_start;
1235         if (rdev->sectors < le64_to_cpu(sb->data_size))
1236                 return -EINVAL;
1237         rdev->sectors = le64_to_cpu(sb->data_size);
1238         if (le64_to_cpu(sb->size) > rdev->sectors)
1239                 return -EINVAL;
1240         return ret;
1241 }
1242
1243 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1244 {
1245         struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1246         __u64 ev1 = le64_to_cpu(sb->events);
1247
1248         rdev->raid_disk = -1;
1249         clear_bit(Faulty, &rdev->flags);
1250         clear_bit(In_sync, &rdev->flags);
1251         clear_bit(WriteMostly, &rdev->flags);
1252         clear_bit(BarriersNotsupp, &rdev->flags);
1253
1254         if (mddev->raid_disks == 0) {
1255                 mddev->major_version = 1;
1256                 mddev->patch_version = 0;
1257                 mddev->external = 0;
1258                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1259                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1260                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1261                 mddev->level = le32_to_cpu(sb->level);
1262                 mddev->clevel[0] = 0;
1263                 mddev->layout = le32_to_cpu(sb->layout);
1264                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1265                 mddev->dev_sectors = le64_to_cpu(sb->size);
1266                 mddev->events = ev1;
1267                 mddev->bitmap_offset = 0;
1268                 mddev->default_bitmap_offset = 1024 >> 9;
1269                 
1270                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1271                 memcpy(mddev->uuid, sb->set_uuid, 16);
1272
1273                 mddev->max_disks =  (4096-256)/2;
1274
1275                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1276                     mddev->bitmap_file == NULL )
1277                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1278
1279                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1280                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1281                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1282                         mddev->new_level = le32_to_cpu(sb->new_level);
1283                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1284                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1285                 } else {
1286                         mddev->reshape_position = MaxSector;
1287                         mddev->delta_disks = 0;
1288                         mddev->new_level = mddev->level;
1289                         mddev->new_layout = mddev->layout;
1290                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1291                 }
1292
1293         } else if (mddev->pers == NULL) {
1294                 /* Insist of good event counter while assembling */
1295                 ++ev1;
1296                 if (ev1 < mddev->events)
1297                         return -EINVAL;
1298         } else if (mddev->bitmap) {
1299                 /* If adding to array with a bitmap, then we can accept an
1300                  * older device, but not too old.
1301                  */
1302                 if (ev1 < mddev->bitmap->events_cleared)
1303                         return 0;
1304         } else {
1305                 if (ev1 < mddev->events)
1306                         /* just a hot-add of a new device, leave raid_disk at -1 */
1307                         return 0;
1308         }
1309         if (mddev->level != LEVEL_MULTIPATH) {
1310                 int role;
1311                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1312                 switch(role) {
1313                 case 0xffff: /* spare */
1314                         break;
1315                 case 0xfffe: /* faulty */
1316                         set_bit(Faulty, &rdev->flags);
1317                         break;
1318                 default:
1319                         if ((le32_to_cpu(sb->feature_map) &
1320                              MD_FEATURE_RECOVERY_OFFSET))
1321                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1322                         else
1323                                 set_bit(In_sync, &rdev->flags);
1324                         rdev->raid_disk = role;
1325                         break;
1326                 }
1327                 if (sb->devflags & WriteMostly1)
1328                         set_bit(WriteMostly, &rdev->flags);
1329         } else /* MULTIPATH are always insync */
1330                 set_bit(In_sync, &rdev->flags);
1331
1332         return 0;
1333 }
1334
1335 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1336 {
1337         struct mdp_superblock_1 *sb;
1338         mdk_rdev_t *rdev2;
1339         int max_dev, i;
1340         /* make rdev->sb match mddev and rdev data. */
1341
1342         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1343
1344         sb->feature_map = 0;
1345         sb->pad0 = 0;
1346         sb->recovery_offset = cpu_to_le64(0);
1347         memset(sb->pad1, 0, sizeof(sb->pad1));
1348         memset(sb->pad2, 0, sizeof(sb->pad2));
1349         memset(sb->pad3, 0, sizeof(sb->pad3));
1350
1351         sb->utime = cpu_to_le64((__u64)mddev->utime);
1352         sb->events = cpu_to_le64(mddev->events);
1353         if (mddev->in_sync)
1354                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1355         else
1356                 sb->resync_offset = cpu_to_le64(0);
1357
1358         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1359
1360         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1361         sb->size = cpu_to_le64(mddev->dev_sectors);
1362         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1363         sb->level = cpu_to_le32(mddev->level);
1364         sb->layout = cpu_to_le32(mddev->layout);
1365
1366         if (mddev->bitmap && mddev->bitmap_file == NULL) {
1367                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1368                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1369         }
1370
1371         if (rdev->raid_disk >= 0 &&
1372             !test_bit(In_sync, &rdev->flags)) {
1373                 if (mddev->curr_resync_completed > rdev->recovery_offset)
1374                         rdev->recovery_offset = mddev->curr_resync_completed;
1375                 if (rdev->recovery_offset > 0) {
1376                         sb->feature_map |=
1377                                 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1378                         sb->recovery_offset =
1379                                 cpu_to_le64(rdev->recovery_offset);
1380                 }
1381         }
1382
1383         if (mddev->reshape_position != MaxSector) {
1384                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1385                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1386                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1387                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1388                 sb->new_level = cpu_to_le32(mddev->new_level);
1389                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1390         }
1391
1392         max_dev = 0;
1393         list_for_each_entry(rdev2, &mddev->disks, same_set)
1394                 if (rdev2->desc_nr+1 > max_dev)
1395                         max_dev = rdev2->desc_nr+1;
1396
1397         if (max_dev > le32_to_cpu(sb->max_dev))
1398                 sb->max_dev = cpu_to_le32(max_dev);
1399         for (i=0; i<max_dev;i++)
1400                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1401         
1402         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1403                 i = rdev2->desc_nr;
1404                 if (test_bit(Faulty, &rdev2->flags))
1405                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1406                 else if (test_bit(In_sync, &rdev2->flags))
1407                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1408                 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1409                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1410                 else
1411                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1412         }
1413
1414         sb->sb_csum = calc_sb_1_csum(sb);
1415 }
1416
1417 static unsigned long long
1418 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1419 {
1420         struct mdp_superblock_1 *sb;
1421         sector_t max_sectors;
1422         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1423                 return 0; /* component must fit device */
1424         if (rdev->sb_start < rdev->data_offset) {
1425                 /* minor versions 1 and 2; superblock before data */
1426                 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1427                 max_sectors -= rdev->data_offset;
1428                 if (!num_sectors || num_sectors > max_sectors)
1429                         num_sectors = max_sectors;
1430         } else if (rdev->mddev->bitmap_offset) {
1431                 /* minor version 0 with bitmap we can't move */
1432                 return 0;
1433         } else {
1434                 /* minor version 0; superblock after data */
1435                 sector_t sb_start;
1436                 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1437                 sb_start &= ~(sector_t)(4*2 - 1);
1438                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1439                 if (!num_sectors || num_sectors > max_sectors)
1440                         num_sectors = max_sectors;
1441                 rdev->sb_start = sb_start;
1442         }
1443         sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1444         sb->data_size = cpu_to_le64(num_sectors);
1445         sb->super_offset = rdev->sb_start;
1446         sb->sb_csum = calc_sb_1_csum(sb);
1447         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1448                        rdev->sb_page);
1449         md_super_wait(rdev->mddev);
1450         return num_sectors / 2; /* kB for sysfs */
1451 }
1452
1453 static struct super_type super_types[] = {
1454         [0] = {
1455                 .name   = "0.90.0",
1456                 .owner  = THIS_MODULE,
1457                 .load_super         = super_90_load,
1458                 .validate_super     = super_90_validate,
1459                 .sync_super         = super_90_sync,
1460                 .rdev_size_change   = super_90_rdev_size_change,
1461         },
1462         [1] = {
1463                 .name   = "md-1",
1464                 .owner  = THIS_MODULE,
1465                 .load_super         = super_1_load,
1466                 .validate_super     = super_1_validate,
1467                 .sync_super         = super_1_sync,
1468                 .rdev_size_change   = super_1_rdev_size_change,
1469         },
1470 };
1471
1472 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1473 {
1474         mdk_rdev_t *rdev, *rdev2;
1475
1476         rcu_read_lock();
1477         rdev_for_each_rcu(rdev, mddev1)
1478                 rdev_for_each_rcu(rdev2, mddev2)
1479                         if (rdev->bdev->bd_contains ==
1480                             rdev2->bdev->bd_contains) {
1481                                 rcu_read_unlock();
1482                                 return 1;
1483                         }
1484         rcu_read_unlock();
1485         return 0;
1486 }
1487
/*
 * File-local list head.  NOTE(review): not referenced in this part of
 * the file; presumably holds devices waiting to be assembled into an
 * array -- confirm against its users.
 */
1488 static LIST_HEAD(pending_raid_disks);
1489
1490 static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
1491 {
1492         struct mdk_personality *pers = mddev->pers;
1493         struct gendisk *disk = mddev->gendisk;
1494         struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1495         struct blk_integrity *bi_mddev = blk_get_integrity(disk);
1496
1497         /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
1498         if (pers && pers->level >= 4 && pers->level <= 6)
1499                 return;
1500
1501         /* If rdev is integrity capable, register profile for mddev */
1502         if (!bi_mddev && bi_rdev) {
1503                 if (blk_integrity_register(disk, bi_rdev))
1504                         printk(KERN_ERR "%s: %s Could not register integrity!\n",
1505                                __func__, disk->disk_name);
1506                 else
1507                         printk(KERN_NOTICE "Enabling data integrity on %s\n",
1508                                disk->disk_name);
1509                 return;
1510         }
1511
1512         /* Check that mddev and rdev have matching profiles */
1513         if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
1514                 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
1515                        disk->disk_name, rdev->bdev->bd_disk->disk_name);
1516                 printk(KERN_NOTICE "Disabling data integrity on %s\n",
1517                        disk->disk_name);
1518                 blk_integrity_unregister(disk);
1519         }
1520 }
1521
1522 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1523 {
1524         char b[BDEVNAME_SIZE];
1525         struct kobject *ko;
1526         char *s;
1527         int err;
1528
1529         if (rdev->mddev) {
1530                 MD_BUG();
1531                 return -EINVAL;
1532         }
1533
1534         /* prevent duplicates */
1535         if (find_rdev(mddev, rdev->bdev->bd_dev))
1536                 return -EEXIST;
1537
1538         /* make sure rdev->sectors exceeds mddev->dev_sectors */
1539         if (rdev->sectors && (mddev->dev_sectors == 0 ||
1540                         rdev->sectors < mddev->dev_sectors)) {
1541                 if (mddev->pers) {
1542                         /* Cannot change size, so fail
1543                          * If mddev->level <= 0, then we don't care
1544                          * about aligning sizes (e.g. linear)
1545                          */
1546                         if (mddev->level > 0)
1547                                 return -ENOSPC;
1548                 } else
1549                         mddev->dev_sectors = rdev->sectors;
1550         }
1551
1552         /* Verify rdev->desc_nr is unique.
1553          * If it is -1, assign a free number, else
1554          * check number is not in use
1555          */
1556         if (rdev->desc_nr < 0) {
1557                 int choice = 0;
1558                 if (mddev->pers) choice = mddev->raid_disks;
1559                 while (find_rdev_nr(mddev, choice))
1560                         choice++;
1561                 rdev->desc_nr = choice;
1562         } else {
1563                 if (find_rdev_nr(mddev, rdev->desc_nr))
1564                         return -EBUSY;
1565         }
1566         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1567                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1568                        mdname(mddev), mddev->max_disks);
1569                 return -EBUSY;
1570         }
1571         bdevname(rdev->bdev,b);
1572         while ( (s=strchr(b, '/')) != NULL)
1573                 *s = '!';
1574
1575         rdev->mddev = mddev;
1576         printk(KERN_INFO "md: bind<%s>\n", b);
1577
1578         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1579                 goto fail;
1580
1581         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1582         if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1583                 kobject_del(&rdev->kobj);
1584                 goto fail;
1585         }
1586         rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1587
1588         list_add_rcu(&rdev->same_set, &mddev->disks);
1589         bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1590
1591         /* May as well allow recovery to be retried once */
1592         mddev->recovery_disabled = 0;
1593
1594         md_integrity_check(rdev, mddev);
1595         return 0;
1596
1597  fail:
1598         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1599                b, mdname(mddev));
1600         return err;
1601 }
1602
1603 static void md_delayed_delete(struct work_struct *ws)
1604 {
1605         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1606         kobject_del(&rdev->kobj);
1607         kobject_put(&rdev->kobj);
1608 }
1609
1610 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1611 {
1612         char b[BDEVNAME_SIZE];
1613         if (!rdev->mddev) {
1614                 MD_BUG();
1615                 return;
1616         }
1617         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1618         list_del_rcu(&rdev->same_set);
1619         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1620         rdev->mddev = NULL;
1621         sysfs_remove_link(&rdev->kobj, "block");
1622         sysfs_put(rdev->sysfs_state);
1623         rdev->sysfs_state = NULL;
1624         /* We need to delay this, otherwise we can deadlock when
1625          * writing to 'remove' to "dev/state".  We also need
1626          * to delay it due to rcu usage.
1627          */
1628         synchronize_rcu();
1629         INIT_WORK(&rdev->del_work, md_delayed_delete);
1630         kobject_get(&rdev->kobj);
1631         schedule_work(&rdev->del_work);
1632 }
1633
1634 /*
1635  * prevent the device from being mounted, repartitioned or
1636  * otherwise reused by a RAID array (or any other kernel
1637  * subsystem), by bd_claiming the device.
1638  */
1639 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1640 {
1641         int err = 0;
1642         struct block_device *bdev;
1643         char b[BDEVNAME_SIZE];
1644
1645         bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1646         if (IS_ERR(bdev)) {
1647                 printk(KERN_ERR "md: could not open %s.\n",
1648                         __bdevname(dev, b));
1649                 return PTR_ERR(bdev);
1650         }
1651         err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1652         if (err) {
1653                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1654                         bdevname(bdev, b));
1655                 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1656                 return err;
1657         }
1658         if (!shared)
1659                 set_bit(AllReserved, &rdev->flags);
1660         rdev->bdev = bdev;
1661         return err;
1662 }
1663
1664 static void unlock_rdev(mdk_rdev_t *rdev)
1665 {
1666         struct block_device *bdev = rdev->bdev;
1667         rdev->bdev = NULL;
1668         if (!bdev)
1669                 MD_BUG();
1670         bd_release(bdev);
1671         blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1672 }
1673
1674 void md_autodetect_dev(dev_t dev);
1675
1676 static void export_rdev(mdk_rdev_t * rdev)
1677 {
1678         char b[BDEVNAME_SIZE];
1679         printk(KERN_INFO "md: export_rdev(%s)\n",
1680                 bdevname(rdev->bdev,b));
1681         if (rdev->mddev)
1682                 MD_BUG();
1683         free_disk_sb(rdev);
1684 #ifndef MODULE
1685         if (test_bit(AutoDetected, &rdev->flags))
1686                 md_autodetect_dev(rdev->bdev->bd_dev);
1687 #endif
1688         unlock_rdev(rdev);
1689         kobject_put(&rdev->kobj);
1690 }
1691
1692 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1693 {
1694         unbind_rdev_from_array(rdev);
1695         export_rdev(rdev);
1696 }
1697
1698 static void export_array(mddev_t *mddev)
1699 {
1700         mdk_rdev_t *rdev, *tmp;
1701
1702         rdev_for_each(rdev, tmp, mddev) {
1703                 if (!rdev->mddev) {
1704                         MD_BUG();
1705                         continue;
1706                 }
1707                 kick_rdev_from_array(rdev);
1708         }
1709         if (!list_empty(&mddev->disks))
1710                 MD_BUG();
1711         mddev->raid_disks = 0;
1712         mddev->major_version = 0;
1713 }
1714
1715 static void print_desc(mdp_disk_t *desc)
1716 {
1717         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1718                 desc->major,desc->minor,desc->raid_disk,desc->state);
1719 }
1720
1721 static void print_sb_90(mdp_super_t *sb)
1722 {
1723         int i;
1724
1725         printk(KERN_INFO 
1726                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1727                 sb->major_version, sb->minor_version, sb->patch_version,
1728                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1729                 sb->ctime);
1730         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1731                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1732                 sb->md_minor, sb->layout, sb->chunk_size);
1733         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1734                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1735                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1736                 sb->failed_disks, sb->spare_disks,
1737                 sb->sb_csum, (unsigned long)sb->events_lo);
1738
1739         printk(KERN_INFO);
1740         for (i = 0; i < MD_SB_DISKS; i++) {
1741                 mdp_disk_t *desc;
1742
1743                 desc = sb->disks + i;
1744                 if (desc->number || desc->major || desc->minor ||
1745                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1746                         printk("     D %2d: ", i);
1747                         print_desc(desc);
1748                 }
1749         }
1750         printk(KERN_INFO "md:     THIS: ");
1751         print_desc(&sb->this_disk);
1752 }
1753
1754 static void print_sb_1(struct mdp_superblock_1 *sb)
1755 {
1756         __u8 *uuid;
1757
1758         uuid = sb->set_uuid;
1759         printk(KERN_INFO "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1760                         ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1761                KERN_INFO "md:    Name: \"%s\" CT:%llu\n",
1762                 le32_to_cpu(sb->major_version),
1763                 le32_to_cpu(sb->feature_map),
1764                 uuid[0], uuid[1], uuid[2], uuid[3],
1765                 uuid[4], uuid[5], uuid[6], uuid[7],
1766                 uuid[8], uuid[9], uuid[10], uuid[11],
1767                 uuid[12], uuid[13], uuid[14], uuid[15],
1768                 sb->set_name,
1769                 (unsigned long long)le64_to_cpu(sb->ctime)
1770                        & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1771
1772         uuid = sb->device_uuid;
1773         printk(KERN_INFO "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1774                         " RO:%llu\n"
1775                KERN_INFO "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1776                         ":%02x%02x%02x%02x%02x%02x\n"
1777                KERN_INFO "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1778                KERN_INFO "md:         (MaxDev:%u) \n",
1779                 le32_to_cpu(sb->level),
1780                 (unsigned long long)le64_to_cpu(sb->size),
1781                 le32_to_cpu(sb->raid_disks),
1782                 le32_to_cpu(sb->layout),
1783                 le32_to_cpu(sb->chunksize),
1784                 (unsigned long long)le64_to_cpu(sb->data_offset),
1785                 (unsigned long long)le64_to_cpu(sb->data_size),
1786                 (unsigned long long)le64_to_cpu(sb->super_offset),
1787                 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1788                 le32_to_cpu(sb->dev_number),
1789                 uuid[0], uuid[1], uuid[2], uuid[3],
1790                 uuid[4], uuid[5], uuid[6], uuid[7],
1791                 uuid[8], uuid[9], uuid[10], uuid[11],
1792                 uuid[12], uuid[13], uuid[14], uuid[15],
1793                 sb->devflags,
1794                 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1795                 (unsigned long long)le64_to_cpu(sb->events),
1796                 (unsigned long long)le64_to_cpu(sb->resync_offset),
1797                 le32_to_cpu(sb->sb_csum),
1798                 le32_to_cpu(sb->max_dev)
1799                 );
1800 }
1801
1802 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1803 {
1804         char b[BDEVNAME_SIZE];
1805         printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1806                 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1807                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1808                 rdev->desc_nr);
1809         if (rdev->sb_loaded) {
1810                 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1811                 switch (major_version) {
1812                 case 0:
1813                         print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1814                         break;
1815                 case 1:
1816                         print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1817                         break;
1818                 }
1819         } else
1820                 printk(KERN_INFO "md: no rdev superblock!\n");
1821 }
1822
1823 static void md_print_devices(void)
1824 {
1825         struct list_head *tmp;
1826         mdk_rdev_t *rdev;
1827         mddev_t *mddev;
1828         char b[BDEVNAME_SIZE];
1829
1830         printk("\n");
1831         printk("md:     **********************************\n");
1832         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1833         printk("md:     **********************************\n");
1834         for_each_mddev(mddev, tmp) {
1835
1836                 if (mddev->bitmap)
1837                         bitmap_print_sb(mddev->bitmap);
1838                 else
1839                         printk("%s: ", mdname(mddev));
1840                 list_for_each_entry(rdev, &mddev->disks, same_set)
1841                         printk("<%s>", bdevname(rdev->bdev,b));
1842                 printk("\n");
1843
1844                 list_for_each_entry(rdev, &mddev->disks, same_set)
1845                         print_rdev(rdev, mddev->major_version);
1846         }
1847         printk("md:     **********************************\n");
1848         printk("\n");
1849 }
1850
1851
1852 static void sync_sbs(mddev_t * mddev, int nospares)
1853 {
1854         /* Update each superblock (in-memory image), but
1855          * if we are allowed to, skip spares which already
1856          * have the right event counter, or have one earlier
1857          * (which would mean they aren't being marked as dirty
1858          * with the rest of the array)
1859          */
1860         mdk_rdev_t *rdev;
1861
1862         list_for_each_entry(rdev, &mddev->disks, same_set) {
1863                 if (rdev->sb_events == mddev->events ||
1864                     (nospares &&
1865                      rdev->raid_disk < 0 &&
1866                      (rdev->sb_events&1)==0 &&
1867                      rdev->sb_events+1 == mddev->events)) {
1868                         /* Don't update this superblock */
1869                         rdev->sb_loaded = 2;
1870                 } else {
1871                         super_types[mddev->major_version].
1872                                 sync_super(mddev, rdev);
1873                         rdev->sb_loaded = 1;
1874                 }
1875         }
1876 }
1877
1878 static void md_update_sb(mddev_t * mddev, int force_change)
1879 {
1880         mdk_rdev_t *rdev;
1881         int sync_req;
1882         int nospares = 0;
1883
1884         mddev->utime = get_seconds();
1885         if (mddev->external)
1886                 return;
1887 repeat:
1888         spin_lock_irq(&mddev->write_lock);
1889
1890         set_bit(MD_CHANGE_PENDING, &mddev->flags);
1891         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1892                 force_change = 1;
1893         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1894                 /* just a clean<-> dirty transition, possibly leave spares alone,
1895                  * though if events isn't the right even/odd, we will have to do
1896                  * spares after all
1897                  */
1898                 nospares = 1;
1899         if (force_change)
1900                 nospares = 0;
1901         if (mddev->degraded)
1902                 /* If the array is degraded, then skipping spares is both
1903                  * dangerous and fairly pointless.
1904                  * Dangerous because a device that was removed from the array
1905                  * might have a event_count that still looks up-to-date,
1906                  * so it can be re-added without a resync.
1907                  * Pointless because if there are any spares to skip,
1908                  * then a recovery will happen and soon that array won't
1909                  * be degraded any more and the spare can go back to sleep then.
1910                  */
1911                 nospares = 0;
1912
1913         sync_req = mddev->in_sync;
1914
1915         /* If this is just a dirty<->clean transition, and the array is clean
1916          * and 'events' is odd, we can roll back to the previous clean state */
1917         if (nospares
1918             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1919             && (mddev->events & 1)
1920             && mddev->events != 1)
1921                 mddev->events--;
1922         else {
1923                 /* otherwise we have to go forward and ... */
1924                 mddev->events ++;
1925                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1926                         /* .. if the array isn't clean, insist on an odd 'events' */
1927                         if ((mddev->events&1)==0) {
1928                                 mddev->events++;
1929                                 nospares = 0;
1930                         }
1931                 } else {
1932                         /* otherwise insist on an even 'events' (for clean states) */
1933                         if ((mddev->events&1)) {
1934                                 mddev->events++;
1935                                 nospares = 0;
1936                         }
1937                 }
1938         }
1939
1940         if (!mddev->events) {
1941                 /*
1942                  * oops, this 64-bit counter should never wrap.
1943                  * Either we are in around ~1 trillion A.C., assuming
1944                  * 1 reboot per second, or we have a bug:
1945                  */
1946                 MD_BUG();
1947                 mddev->events --;
1948         }
1949
1950         /*
1951          * do not write anything to disk if using
1952          * nonpersistent superblocks
1953          */
1954         if (!mddev->persistent) {
1955                 if (!mddev->external)
1956                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1957
1958                 spin_unlock_irq(&mddev->write_lock);
1959                 wake_up(&mddev->sb_wait);
1960                 return;
1961         }
1962         sync_sbs(mddev, nospares);
1963         spin_unlock_irq(&mddev->write_lock);
1964
1965         dprintk(KERN_INFO 
1966                 "md: updating %s RAID superblock on device (in sync %d)\n",
1967                 mdname(mddev),mddev->in_sync);
1968
1969         bitmap_update_sb(mddev->bitmap);
1970         list_for_each_entry(rdev, &mddev->disks, same_set) {
1971                 char b[BDEVNAME_SIZE];
1972                 dprintk(KERN_INFO "md: ");
1973                 if (rdev->sb_loaded != 1)
1974                         continue; /* no noise on spare devices */
1975                 if (test_bit(Faulty, &rdev->flags))
1976                         dprintk("(skipping faulty ");
1977
1978                 dprintk("%s ", bdevname(rdev->bdev,b));
1979                 if (!test_bit(Faulty, &rdev->flags)) {
1980                         md_super_write(mddev,rdev,
1981                                        rdev->sb_start, rdev->sb_size,
1982                                        rdev->sb_page);
1983                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1984                                 bdevname(rdev->bdev,b),
1985                                 (unsigned long long)rdev->sb_start);
1986                         rdev->sb_events = mddev->events;
1987
1988                 } else
1989                         dprintk(")\n");
1990                 if (mddev->level == LEVEL_MULTIPATH)
1991                         /* only need to write one superblock... */
1992                         break;
1993         }
1994         md_super_wait(mddev);
1995         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1996
1997         spin_lock_irq(&mddev->write_lock);
1998         if (mddev->in_sync != sync_req ||
1999             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2000                 /* have to write it out again */
2001                 spin_unlock_irq(&mddev->write_lock);
2002                 goto repeat;
2003         }
2004         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2005         spin_unlock_irq(&mddev->write_lock);
2006         wake_up(&mddev->sb_wait);
2007         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2008                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2009
2010 }
2011
/*
 * Words written to sysfs files may, or may not, be '\n' terminated.
 * We want to accept either form; cmd_match() does the comparison.
 *
 * Returns 1 if @cmd (as written into a sysfs file) is equal to @str,
 * optionally followed by exactly one trailing newline; 0 otherwise.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Skip the common prefix. */
	for (; *cmd != '\0' && *cmd == *str; cmd++, str++)
		;

	/* Tolerate a single trailing newline on the command. */
	if (*cmd == '\n')
		cmd++;

	/* Both strings must now be exhausted. */
	return *str == '\0' && *cmd == '\0';
}
2031
2032 struct rdev_sysfs_entry {
2033         struct attribute attr;
2034         ssize_t (*show)(mdk_rdev_t *, char *);
2035         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2036 };
2037
2038 static ssize_t
2039 state_show(mdk_rdev_t *rdev, char *page)
2040 {
2041         char *sep = "";
2042         size_t len = 0;
2043
2044         if (test_bit(Faulty, &rdev->flags)) {
2045                 len+= sprintf(page+len, "%sfaulty",sep);
2046                 sep = ",";
2047         }
2048         if (test_bit(In_sync, &rdev->flags)) {
2049                 len += sprintf(page+len, "%sin_sync",sep);
2050                 sep = ",";
2051         }
2052         if (test_bit(WriteMostly, &rdev->flags)) {
2053                 len += sprintf(page+len, "%swrite_mostly",sep);
2054                 sep = ",";
2055         }
2056         if (test_bit(Blocked, &rdev->flags)) {
2057                 len += sprintf(page+len, "%sblocked", sep);
2058                 sep = ",";
2059         }
2060         if (!test_bit(Faulty, &rdev->flags) &&
2061             !test_bit(In_sync, &rdev->flags)) {
2062                 len += sprintf(page+len, "%sspare", sep);
2063                 sep = ",";
2064         }
2065         return len+sprintf(page+len, "\n");
2066 }
2067
2068 static ssize_t
2069 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2070 {
2071         /* can write
2072          *  faulty  - simulates and error
2073          *  remove  - disconnects the device
2074          *  writemostly - sets write_mostly
2075          *  -writemostly - clears write_mostly
2076          *  blocked - sets the Blocked flag
2077          *  -blocked - clears the Blocked flag
2078          *  insync - sets Insync providing device isn't active
2079          */
2080         int err = -EINVAL;
2081         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2082                 md_error(rdev->mddev, rdev);
2083                 err = 0;
2084         } else if (cmd_match(buf, "remove")) {
2085                 if (rdev->raid_disk >= 0)
2086                         err = -EBUSY;
2087                 else {
2088                         mddev_t *mddev = rdev->mddev;
2089                         kick_rdev_from_array(rdev);
2090                         if (mddev->pers)
2091                                 md_update_sb(mddev, 1);
2092                         md_new_event(mddev);
2093                         err = 0;
2094                 }
2095         } else if (cmd_match(buf, "writemostly")) {
2096                 set_bit(WriteMostly, &rdev->flags);
2097                 err = 0;
2098         } else if (cmd_match(buf, "-writemostly")) {
2099                 clear_bit(WriteMostly, &rdev->flags);
2100                 err = 0;
2101         } else if (cmd_match(buf, "blocked")) {
2102                 set_bit(Blocked, &rdev->flags);
2103                 err = 0;
2104         } else if (cmd_match(buf, "-blocked")) {
2105                 clear_bit(Blocked, &rdev->flags);
2106                 wake_up(&rdev->blocked_wait);
2107                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2108                 md_wakeup_thread(rdev->mddev->thread);
2109
2110                 err = 0;
2111         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2112                 set_bit(In_sync, &rdev->flags);
2113                 err = 0;
2114         }
2115         if (!err && rdev->sysfs_state)
2116                 sysfs_notify_dirent(rdev->sysfs_state);
2117         return err ? err : len;
2118 }
2119 static struct rdev_sysfs_entry rdev_state =
2120 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2121
2122 static ssize_t
2123 errors_show(mdk_rdev_t *rdev, char *page)
2124 {
2125         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2126 }
2127
2128 static ssize_t
2129 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2130 {
2131         char *e;
2132         unsigned long n = simple_strtoul(buf, &e, 10);
2133         if (*buf && (*e == 0 || *e == '\n')) {
2134                 atomic_set(&rdev->corrected_errors, n);
2135                 return len;
2136         }
2137         return -EINVAL;
2138 }
2139 static struct rdev_sysfs_entry rdev_errors =
2140 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2141
2142 static ssize_t
2143 slot_show(mdk_rdev_t *rdev, char *page)
2144 {
2145         if (rdev->raid_disk < 0)
2146                 return sprintf(page, "none\n");
2147         else
2148                 return sprintf(page, "%d\n", rdev->raid_disk);
2149 }
2150
2151 static ssize_t
2152 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2153 {
2154         char *e;
2155         int err;
2156         char nm[20];
2157         int slot = simple_strtoul(buf, &e, 10);
2158         if (strncmp(buf, "none", 4)==0)
2159                 slot = -1;
2160         else if (e==buf || (*e && *e!= '\n'))
2161                 return -EINVAL;
2162         if (rdev->mddev->pers && slot == -1) {
2163                 /* Setting 'slot' on an active array requires also
2164                  * updating the 'rd%d' link, and communicating
2165                  * with the personality with ->hot_*_disk.
2166                  * For now we only support removing
2167                  * failed/spare devices.  This normally happens automatically,
2168                  * but not when the metadata is externally managed.
2169                  */
2170                 if (rdev->raid_disk == -1)
2171                         return -EEXIST;
2172                 /* personality does all needed checks */
2173                 if (rdev->mddev->pers->hot_add_disk == NULL)
2174                         return -EINVAL;
2175                 err = rdev->mddev->pers->
2176                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2177                 if (err)
2178                         return err;
2179                 sprintf(nm, "rd%d", rdev->raid_disk);
2180                 sysfs_remove_link(&rdev->mddev->kobj, nm);
2181                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2182                 md_wakeup_thread(rdev->mddev->thread);
2183         } else if (rdev->mddev->pers) {
2184                 mdk_rdev_t *rdev2;
2185                 /* Activating a spare .. or possibly reactivating
2186                  * if we ever get bitmaps working here.
2187                  */
2188
2189                 if (rdev->raid_disk != -1)
2190                         return -EBUSY;
2191
2192                 if (rdev->mddev->pers->hot_add_disk == NULL)
2193                         return -EINVAL;
2194
2195                 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2196                         if (rdev2->raid_disk == slot)
2197                                 return -EEXIST;
2198
2199                 rdev->raid_disk = slot;
2200                 if (test_bit(In_sync, &rdev->flags))
2201                         rdev->saved_raid_disk = slot;
2202                 else
2203                         rdev->saved_raid_disk = -1;
2204                 err = rdev->mddev->pers->
2205                         hot_add_disk(rdev->mddev, rdev);
2206                 if (err) {
2207                         rdev->raid_disk = -1;
2208                         return err;
2209                 } else
2210                         sysfs_notify_dirent(rdev->sysfs_state);
2211                 sprintf(nm, "rd%d", rdev->raid_disk);
2212                 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2213                         printk(KERN_WARNING
2214                                "md: cannot register "
2215                                "%s for %s\n",
2216                                nm, mdname(rdev->mddev));
2217
2218                 /* don't wakeup anyone, leave that to userspace. */
2219         } else {
2220                 if (slot >= rdev->mddev->raid_disks)
2221                         return -ENOSPC;
2222                 rdev->raid_disk = slot;
2223                 /* assume it is working */
2224                 clear_bit(Faulty, &rdev->flags);
2225                 clear_bit(WriteMostly, &rdev->flags);
2226                 set_bit(In_sync, &rdev->flags);
2227                 sysfs_notify_dirent(rdev->sysfs_state);
2228         }
2229         return len;
2230 }
2231
2232
2233 static struct rdev_sysfs_entry rdev_slot =
2234 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2235
2236 static ssize_t
2237 offset_show(mdk_rdev_t *rdev, char *page)
2238 {
2239         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2240 }
2241
2242 static ssize_t
2243 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2244 {
2245         char *e;
2246         unsigned long long offset = simple_strtoull(buf, &e, 10);
2247         if (e==buf || (*e && *e != '\n'))
2248                 return -EINVAL;
2249         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2250                 return -EBUSY;
2251         if (rdev->sectors && rdev->mddev->external)
2252                 /* Must set offset before size, so overlap checks
2253                  * can be sane */
2254                 return -EBUSY;
2255         rdev->data_offset = offset;
2256         return len;
2257 }
2258
2259 static struct rdev_sysfs_entry rdev_offset =
2260 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2261
2262 static ssize_t
2263 rdev_size_show(mdk_rdev_t *rdev, char *page)
2264 {
2265         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2266 }
2267
2268 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2269 {
2270         /* check if two start/length pairs overlap */
2271         if (s1+l1 <= s2)
2272                 return 0;
2273         if (s2+l2 <= s1)
2274                 return 0;
2275         return 1;
2276 }
2277
2278 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2279 {
2280         unsigned long long blocks;
2281         sector_t new;
2282
2283         if (strict_strtoull(buf, 10, &blocks) < 0)
2284                 return -EINVAL;
2285
2286         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2287                 return -EINVAL; /* sector conversion overflow */
2288
2289         new = blocks * 2;
2290         if (new != blocks * 2)
2291                 return -EINVAL; /* unsigned long long to sector_t overflow */
2292
2293         *sectors = new;
2294         return 0;
2295 }
2296
2297 static ssize_t
2298 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2299 {
2300         mddev_t *my_mddev = rdev->mddev;
2301         sector_t oldsectors = rdev->sectors;
2302         sector_t sectors;
2303
2304         if (strict_blocks_to_sectors(buf, &sectors) < 0)
2305                 return -EINVAL;
2306         if (my_mddev->pers && rdev->raid_disk >= 0) {
2307                 if (my_mddev->persistent) {
2308                         sectors = super_types[my_mddev->major_version].
2309                                 rdev_size_change(rdev, sectors);
2310                         if (!sectors)
2311                                 return -EBUSY;
2312                 } else if (!sectors)
2313                         sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2314                                 rdev->data_offset;
2315         }
2316         if (sectors < my_mddev->dev_sectors)
2317                 return -EINVAL; /* component must fit device */
2318
2319         rdev->sectors = sectors;
2320         if (sectors > oldsectors && my_mddev->external) {
2321                 /* need to check that all other rdevs with the same ->bdev
2322                  * do not overlap.  We need to unlock the mddev to avoid
2323                  * a deadlock.  We have already changed rdev->sectors, and if
2324                  * we have to change it back, we will have the lock again.
2325                  */
2326                 mddev_t *mddev;
2327                 int overlap = 0;
2328                 struct list_head *tmp;
2329
2330                 mddev_unlock(my_mddev);
2331                 for_each_mddev(mddev, tmp) {
2332                         mdk_rdev_t *rdev2;
2333
2334                         mddev_lock(mddev);
2335                         list_for_each_entry(rdev2, &mddev->disks, same_set)
2336                                 if (test_bit(AllReserved, &rdev2->flags) ||
2337                                     (rdev->bdev == rdev2->bdev &&
2338                                      rdev != rdev2 &&
2339                                      overlaps(rdev->data_offset, rdev->sectors,
2340                                               rdev2->data_offset,
2341                                               rdev2->sectors))) {
2342                                         overlap = 1;
2343                                         break;
2344                                 }
2345                         mddev_unlock(mddev);
2346                         if (overlap) {
2347                                 mddev_put(mddev);
2348                                 break;
2349                         }
2350                 }
2351                 mddev_lock(my_mddev);
2352                 if (overlap) {
2353                         /* Someone else could have slipped in a size
2354                          * change here, but doing so is just silly.
2355                          * We put oldsectors back because we *know* it is
2356                          * safe, and trust userspace not to race with
2357                          * itself
2358                          */
2359                         rdev->sectors = oldsectors;
2360                         return -EBUSY;
2361                 }
2362         }
2363         return len;
2364 }
2365
2366 static struct rdev_sysfs_entry rdev_size =
2367 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2368
2369 static struct attribute *rdev_default_attrs[] = {
2370         &rdev_state.attr,
2371         &rdev_errors.attr,
2372         &rdev_slot.attr,
2373         &rdev_offset.attr,
2374         &rdev_size.attr,
2375         NULL,
2376 };
2377 static ssize_t
2378 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2379 {
2380         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2381         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2382         mddev_t *mddev = rdev->mddev;
2383         ssize_t rv;
2384
2385         if (!entry->show)
2386                 return -EIO;
2387
2388         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2389         if (!rv) {
2390                 if (rdev->mddev == NULL)
2391                         rv = -EBUSY;
2392                 else
2393                         rv = entry->show(rdev, page);
2394                 mddev_unlock(mddev);
2395         }
2396         return rv;
2397 }
2398
2399 static ssize_t
2400 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2401               const char *page, size_t length)
2402 {
2403         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2404         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2405         ssize_t rv;
2406         mddev_t *mddev = rdev->mddev;
2407
2408         if (!entry->store)
2409                 return -EIO;
2410         if (!capable(CAP_SYS_ADMIN))
2411                 return -EACCES;
2412         rv = mddev ? mddev_lock(mddev): -EBUSY;
2413         if (!rv) {
2414                 if (rdev->mddev == NULL)
2415                         rv = -EBUSY;
2416                 else
2417                         rv = entry->store(rdev, page, length);
2418                 mddev_unlock(mddev);
2419         }
2420         return rv;
2421 }
2422
2423 static void rdev_free(struct kobject *ko)
2424 {
2425         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2426         kfree(rdev);
2427 }
2428 static struct sysfs_ops rdev_sysfs_ops = {
2429         .show           = rdev_attr_show,
2430         .store          = rdev_attr_store,
2431 };
2432 static struct kobj_type rdev_ktype = {
2433         .release        = rdev_free,
2434         .sysfs_ops      = &rdev_sysfs_ops,
2435         .default_attrs  = rdev_default_attrs,
2436 };
2437
2438 /*
2439  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2440  *
2441  * mark the device faulty if:
2442  *
2443  *   - the device is nonexistent (zero size)
2444  *   - the device has no valid superblock
2445  *
2446  * a faulty rdev _never_ has rdev->sb set.
2447  */
2448 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2449 {
2450         char b[BDEVNAME_SIZE];
2451         int err;
2452         mdk_rdev_t *rdev;
2453         sector_t size;
2454
2455         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2456         if (!rdev) {
2457                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2458                 return ERR_PTR(-ENOMEM);
2459         }
2460
2461         if ((err = alloc_disk_sb(rdev)))
2462                 goto abort_free;
2463
2464         err = lock_rdev(rdev, newdev, super_format == -2);
2465         if (err)
2466                 goto abort_free;
2467
2468         kobject_init(&rdev->kobj, &rdev_ktype);
2469
2470         rdev->desc_nr = -1;
2471         rdev->saved_raid_disk = -1;
2472         rdev->raid_disk = -1;
2473         rdev->flags = 0;
2474         rdev->data_offset = 0;
2475         rdev->sb_events = 0;
2476         atomic_set(&rdev->nr_pending, 0);
2477         atomic_set(&rdev->read_errors, 0);
2478         atomic_set(&rdev->corrected_errors, 0);
2479
2480         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2481         if (!size) {
2482                 printk(KERN_WARNING 
2483                         "md: %s has zero or unknown size, marking faulty!\n",
2484                         bdevname(rdev->bdev,b));
2485                 err = -EINVAL;
2486                 goto abort_free;
2487         }
2488
2489         if (super_format >= 0) {
2490                 err = super_types[super_format].
2491                         load_super(rdev, NULL, super_minor);
2492                 if (err == -EINVAL) {
2493                         printk(KERN_WARNING
2494                                 "md: %s does not have a valid v%d.%d "
2495                                "superblock, not importing!\n",
2496                                 bdevname(rdev->bdev,b),
2497                                super_format, super_minor);
2498                         goto abort_free;
2499                 }
2500                 if (err < 0) {
2501                         printk(KERN_WARNING 
2502                                 "md: could not read %s's sb, not importing!\n",
2503                                 bdevname(rdev->bdev,b));
2504                         goto abort_free;
2505                 }
2506         }
2507
2508         INIT_LIST_HEAD(&rdev->same_set);
2509         init_waitqueue_head(&rdev->blocked_wait);
2510
2511         return rdev;
2512
2513 abort_free:
2514         if (rdev->sb_page) {
2515                 if (rdev->bdev)
2516                         unlock_rdev(rdev);
2517                 free_disk_sb(rdev);
2518         }
2519         kfree(rdev);
2520         return ERR_PTR(err);
2521 }
2522
2523 /*
2524  * Check a full RAID array for plausibility
2525  */
2526
2527
2528 static void analyze_sbs(mddev_t * mddev)
2529 {
2530         int i;
2531         mdk_rdev_t *rdev, *freshest, *tmp;
2532         char b[BDEVNAME_SIZE];
2533
2534         freshest = NULL;
2535         rdev_for_each(rdev, tmp, mddev)
2536                 switch (super_types[mddev->major_version].
2537                         load_super(rdev, freshest, mddev->minor_version)) {
2538                 case 1:
2539                         freshest = rdev;
2540                         break;
2541                 case 0:
2542                         break;
2543                 default:
2544                         printk( KERN_ERR \
2545                                 "md: fatal superblock inconsistency in %s"
2546                                 " -- removing from array\n", 
2547                                 bdevname(rdev->bdev,b));
2548                         kick_rdev_from_array(rdev);
2549                 }
2550
2551
2552         super_types[mddev->major_version].
2553                 validate_super(mddev, freshest);
2554
2555         i = 0;
2556         rdev_for_each(rdev, tmp, mddev) {
2557                 if (rdev->desc_nr >= mddev->max_disks ||
2558                     i > mddev->max_disks) {
2559                         printk(KERN_WARNING
2560                                "md: %s: %s: only %d devices permitted\n",
2561                                mdname(mddev), bdevname(rdev->bdev, b),
2562                                mddev->max_disks);
2563                         kick_rdev_from_array(rdev);
2564                         continue;
2565                 }
2566                 if (rdev != freshest)
2567                         if (super_types[mddev->major_version].
2568                             validate_super(mddev, rdev)) {
2569                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2570                                         " from array!\n",
2571                                         bdevname(rdev->bdev,b));
2572                                 kick_rdev_from_array(rdev);
2573                                 continue;
2574                         }
2575                 if (mddev->level == LEVEL_MULTIPATH) {
2576                         rdev->desc_nr = i++;
2577                         rdev->raid_disk = rdev->desc_nr;
2578                         set_bit(In_sync, &rdev->flags);
2579                 } else if (rdev->raid_disk >= mddev->raid_disks) {
2580                         rdev->raid_disk = -1;
2581                         clear_bit(In_sync, &rdev->flags);
2582                 }
2583         }
2584 }
2585
2586 static void md_safemode_timeout(unsigned long data);
2587
2588 static ssize_t
2589 safe_delay_show(mddev_t *mddev, char *page)
2590 {
2591         int msec = (mddev->safemode_delay*1000)/HZ;
2592         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2593 }
2594 static ssize_t
2595 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2596 {
2597         int scale=1;
2598         int dot=0;
2599         int i;
2600         unsigned long msec;
2601         char buf[30];
2602
2603         /* remove a period, and count digits after it */
2604         if (len >= sizeof(buf))
2605                 return -EINVAL;
2606         strlcpy(buf, cbuf, sizeof(buf));
2607         for (i=0; i<len; i++) {
2608                 if (dot) {
2609                         if (isdigit(buf[i])) {
2610                                 buf[i-1] = buf[i];
2611                                 scale *= 10;
2612                         }
2613                         buf[i] = 0;
2614                 } else if (buf[i] == '.') {
2615                         dot=1;
2616                         buf[i] = 0;
2617                 }
2618         }
2619         if (strict_strtoul(buf, 10, &msec) < 0)
2620                 return -EINVAL;
2621         msec = (msec * 1000) / scale;
2622         if (msec == 0)
2623                 mddev->safemode_delay = 0;
2624         else {
2625                 unsigned long old_delay = mddev->safemode_delay;
2626                 mddev->safemode_delay = (msec*HZ)/1000;
2627                 if (mddev->safemode_delay == 0)
2628                         mddev->safemode_delay = 1;
2629                 if (mddev->safemode_delay < old_delay)
2630                         md_safemode_timeout((unsigned long)mddev);
2631         }
2632         return len;
2633 }
2634 static struct md_sysfs_entry md_safe_delay =
2635 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2636
2637 static ssize_t
2638 level_show(mddev_t *mddev, char *page)
2639 {
2640         struct mdk_personality *p = mddev->pers;
2641         if (p)
2642                 return sprintf(page, "%s\n", p->name);
2643         else if (mddev->clevel[0])
2644                 return sprintf(page, "%s\n", mddev->clevel);
2645         else if (mddev->level != LEVEL_NONE)
2646                 return sprintf(page, "%d\n", mddev->level);
2647         else
2648                 return 0;
2649 }
2650
2651 static ssize_t
2652 level_store(mddev_t *mddev, const char *buf, size_t len)
2653 {
2654         char level[16];
2655         ssize_t rv = len;
2656         struct mdk_personality *pers;
2657         void *priv;
2658
2659         if (mddev->pers == NULL) {
2660                 if (len == 0)
2661                         return 0;
2662                 if (len >= sizeof(mddev->clevel))
2663                         return -ENOSPC;
2664                 strncpy(mddev->clevel, buf, len);
2665                 if (mddev->clevel[len-1] == '\n')
2666                         len--;
2667                 mddev->clevel[len] = 0;
2668                 mddev->level = LEVEL_NONE;
2669                 return rv;
2670         }
2671
2672         /* request to change the personality.  Need to ensure:
2673          *  - array is not engaged in resync/recovery/reshape
2674          *  - old personality can be suspended
2675          *  - new personality will access other array.
2676          */
2677
2678         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2679                 return -EBUSY;
2680
2681         if (!mddev->pers->quiesce) {
2682                 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2683                        mdname(mddev), mddev->pers->name);
2684                 return -EINVAL;
2685         }
2686
2687         /* Now find the new personality */
2688         if (len == 0 || len >= sizeof(level))
2689                 return -EINVAL;
2690         strncpy(level, buf, len);
2691         if (level[len-1] == '\n')
2692                 len--;
2693         level[len] = 0;
2694
2695         request_module("md-%s", level);
2696         spin_lock(&pers_lock);
2697         pers = find_pers(LEVEL_NONE, level);
2698         if (!pers || !try_module_get(pers->owner)) {
2699                 spin_unlock(&pers_lock);
2700                 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2701                 return -EINVAL;
2702         }
2703         spin_unlock(&pers_lock);
2704
2705         if (pers == mddev->pers) {
2706                 /* Nothing to do! */
2707                 module_put(pers->owner);
2708                 return rv;
2709         }
2710         if (!pers->takeover) {
2711                 module_put(pers->owner);
2712                 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2713                        mdname(mddev), level);
2714                 return -EINVAL;
2715         }
2716
2717         /* ->takeover must set new_* and/or delta_disks
2718          * if it succeeds, and may set them when it fails.
2719          */
2720         priv = pers->takeover(mddev);
2721         if (IS_ERR(priv)) {
2722                 mddev->new_level = mddev->level;
2723                 mddev->new_layout = mddev->layout;
2724                 mddev->new_chunk_sectors = mddev->chunk_sectors;
2725                 mddev->raid_disks -= mddev->delta_disks;
2726                 mddev->delta_disks = 0;
2727                 module_put(pers->owner);
2728                 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2729                        mdname(mddev), level);
2730                 return PTR_ERR(priv);
2731         }
2732
2733         /* Looks like we have a winner */
2734         mddev_suspend(mddev);
2735         mddev->pers->stop(mddev);
2736         module_put(mddev->pers->owner);
2737         mddev->pers = pers;
2738         mddev->private = priv;
2739         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2740         mddev->level = mddev->new_level;
2741         mddev->layout = mddev->new_layout;
2742         mddev->chunk_sectors = mddev->new_chunk_sectors;
2743         mddev->delta_disks = 0;
2744         pers->run(mddev);
2745         mddev_resume(mddev);
2746         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2747         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2748         md_wakeup_thread(mddev->thread);
2749         return rv;
2750 }
2751
2752 static struct md_sysfs_entry md_level =
2753 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2754
2755
2756 static ssize_t
2757 layout_show(mddev_t *mddev, char *page)
2758 {
2759         /* just a number, not meaningful for all levels */
2760         if (mddev->reshape_position != MaxSector &&
2761             mddev->layout != mddev->new_layout)
2762                 return sprintf(page, "%d (%d)\n",
2763                                mddev->new_layout, mddev->layout);
2764         return sprintf(page, "%d\n", mddev->layout);
2765 }
2766
2767 static ssize_t
2768 layout_store(mddev_t *mddev, const char *buf, size_t len)
2769 {
2770         char *e;
2771         unsigned long n = simple_strtoul(buf, &e, 10);
2772
2773         if (!*buf || (*e && *e != '\n'))
2774                 return -EINVAL;
2775
2776         if (mddev->pers) {
2777                 int err;
2778                 if (mddev->pers->check_reshape == NULL)
2779                         return -EBUSY;
2780                 mddev->new_layout = n;
2781                 err = mddev->pers->check_reshape(mddev);
2782                 if (err) {
2783                         mddev->new_layout = mddev->layout;
2784                         return err;
2785                 }
2786         } else {
2787                 mddev->new_layout = n;
2788                 if (mddev->reshape_position == MaxSector)
2789                         mddev->layout = n;
2790         }
2791         return len;
2792 }
2793 static struct md_sysfs_entry md_layout =
2794 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2795
2796
2797 static ssize_t
2798 raid_disks_show(mddev_t *mddev, char *page)
2799 {
2800         if (mddev->raid_disks == 0)
2801                 return 0;
2802         if (mddev->reshape_position != MaxSector &&
2803             mddev->delta_disks != 0)
2804                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2805                                mddev->raid_disks - mddev->delta_disks);
2806         return sprintf(page, "%d\n", mddev->raid_disks);
2807 }
2808
2809 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2810
2811 static ssize_t
2812 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2813 {
2814         char *e;
2815         int rv = 0;
2816         unsigned long n = simple_strtoul(buf, &e, 10);
2817
2818         if (!*buf || (*e && *e != '\n'))
2819                 return -EINVAL;
2820
2821         if (mddev->pers)
2822                 rv = update_raid_disks(mddev, n);
2823         else if (mddev->reshape_position != MaxSector) {
2824                 int olddisks = mddev->raid_disks - mddev->delta_disks;
2825                 mddev->delta_disks = n - olddisks;
2826                 mddev->raid_disks = n;
2827         } else
2828                 mddev->raid_disks = n;
2829         return rv ? rv : len;
2830 }
2831 static struct md_sysfs_entry md_raid_disks =
2832 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2833
2834 static ssize_t
2835 chunk_size_show(mddev_t *mddev, char *page)
2836 {
2837         if (mddev->reshape_position != MaxSector &&
2838             mddev->chunk_sectors != mddev->new_chunk_sectors)
2839                 return sprintf(page, "%d (%d)\n",
2840                                mddev->new_chunk_sectors << 9,
2841                                mddev->chunk_sectors << 9);
2842         return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2843 }
2844
2845 static ssize_t
2846 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2847 {
2848         char *e;
2849         unsigned long n = simple_strtoul(buf, &e, 10);
2850
2851         if (!*buf || (*e && *e != '\n'))
2852                 return -EINVAL;
2853
2854         if (mddev->pers) {
2855                 int err;
2856                 if (mddev->pers->check_reshape == NULL)
2857                         return -EBUSY;
2858                 mddev->new_chunk_sectors = n >> 9;
2859                 err = mddev->pers->check_reshape(mddev);
2860                 if (err) {
2861                         mddev->new_chunk_sectors = mddev->chunk_sectors;
2862                         return err;
2863                 }
2864         } else {
2865                 mddev->new_chunk_sectors = n >> 9;
2866                 if (mddev->reshape_position == MaxSector)
2867                         mddev->chunk_sectors = n >> 9;
2868         }
2869         return len;
2870 }
2871 static struct md_sysfs_entry md_chunk_size =
2872 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2873
2874 static ssize_t
2875 resync_start_show(mddev_t *mddev, char *page)
2876 {
2877         if (mddev->recovery_cp == MaxSector)
2878                 return sprintf(page, "none\n");
2879         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2880 }
2881
2882 static ssize_t
2883 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2884 {
2885         char *e;
2886         unsigned long long n = simple_strtoull(buf, &e, 10);
2887
2888         if (mddev->pers)
2889                 return -EBUSY;
2890         if (!*buf || (*e && *e != '\n'))
2891                 return -EINVAL;
2892
2893         mddev->recovery_cp = n;
2894         return len;
2895 }
2896 static struct md_sysfs_entry md_resync_start =
2897 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2898
2899 /*
2900  * The array state can be:
2901  *
2902  * clear
2903  *     No devices, no size, no level
2904  *     Equivalent to STOP_ARRAY ioctl
2905  * inactive
2906  *     May have some settings, but array is not active
2907  *        all IO results in error
2908  *     When written, doesn't tear down array, but just stops it
2909  * suspended (not supported yet)
2910  *     All IO requests will block. The array can be reconfigured.
2911  *     Writing this, if accepted, will block until array is quiescent
2912  * readonly
2913  *     no resync can happen.  no superblocks get written.
2914  *     write requests fail
2915  * read-auto
2916  *     like readonly, but behaves like 'clean' on a write request.
2917  *
2918  * clean - no pending writes, but otherwise active.
2919  *     When written to inactive array, starts without resync
2920  *     If a write request arrives then
2921  *       if metadata is known, mark 'dirty' and switch to 'active'.
2922  *       if not known, block and switch to write-pending
2923  *     If written to an active array that has pending writes, then fails.
2924  * active
2925  *     fully active: IO and resync can be happening.
2926  *     When written to inactive array, starts with resync
2927  *
2928  * write-pending
2929  *     clean, but writes are blocked waiting for 'active' to be written.
2930  *
2931  * active-idle
2932  *     like active, but no writes have been seen for a while (100msec).
2933  *
2934  */
2935 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2936                    write_pending, active_idle, bad_word};
2937 static char *array_states[] = {
2938         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2939         "write-pending", "active-idle", NULL };
2940
/*
 * Return the index of the first entry of the NULL-terminated 'list'
 * that cmd_match() accepts for 'word', or the index of the terminating
 * NULL if none does.
 */
static int match_word(const char *word, char **list)
{
        int idx = 0;

        while (list[idx] && !cmd_match(word, list[idx]))
                idx++;

        return idx;
}
2949
2950 static ssize_t
2951 array_state_show(mddev_t *mddev, char *page)
2952 {
2953         enum array_state st = inactive;
2954
2955         if (mddev->pers)
2956                 switch(mddev->ro) {
2957                 case 1:
2958                         st = readonly;
2959                         break;
2960                 case 2:
2961                         st = read_auto;
2962                         break;
2963                 case 0:
2964                         if (mddev->in_sync)
2965                                 st = clean;
2966                         else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2967                                 st = write_pending;
2968                         else if (mddev->safemode)
2969                                 st = active_idle;
2970                         else
2971                                 st = active;
2972                 }
2973         else {
2974                 if (list_empty(&mddev->disks) &&
2975                     mddev->raid_disks == 0 &&
2976                     mddev->dev_sectors == 0)
2977                         st = clear;
2978                 else
2979                         st = inactive;
2980         }
2981         return sprintf(page, "%s\n", array_states[st]);
2982 }
2983
2984 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
2985 static int do_md_run(mddev_t * mddev);
2986 static int restart_array(mddev_t *mddev);
2987
2988 static ssize_t
2989 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2990 {
2991         int err = -EINVAL;
2992         enum array_state st = match_word(buf, array_states);
2993         switch(st) {
2994         case bad_word:
2995                 break;
2996         case clear:
2997                 /* stopping an active array */
2998                 if (atomic_read(&mddev->openers) > 0)
2999                         return -EBUSY;
3000                 err = do_md_stop(mddev, 0, 0);
3001                 break;
3002         case inactive:
3003                 /* stopping an active array */
3004                 if (mddev->pers) {
3005                         if (atomic_read(&mddev->openers) > 0)
3006                                 return -EBUSY;
3007                         err = do_md_stop(mddev, 2, 0);
3008                 } else
3009                         err = 0; /* already inactive */
3010                 break;
3011         case suspended:
3012                 break; /* not supported yet */
3013         case readonly:
3014                 if (mddev->pers)
3015                         err = do_md_stop(mddev, 1, 0);
3016                 else {
3017                         mddev->ro = 1;
3018                         set_disk_ro(mddev->gendisk, 1);
3019                         err = do_md_run(mddev);
3020                 }
3021                 break;
3022         case read_auto:
3023                 if (mddev->pers) {
3024                         if (mddev->ro == 0)
3025                                 err = do_md_stop(mddev, 1, 0);
3026                         else if (mddev->ro == 1)
3027                                 err = restart_array(mddev);
3028                         if (err == 0) {
3029                                 mddev->ro = 2;
3030                                 set_disk_ro(mddev->gendisk, 0);
3031                         }
3032                 } else {
3033                         mddev->ro = 2;
3034                         err = do_md_run(mddev);
3035                 }
3036                 break;
3037         case clean:
3038                 if (mddev->pers) {
3039                         restart_array(mddev);
3040                         spin_lock_irq(&mddev->write_lock);
3041                         if (atomic_read(&mddev->writes_pending) == 0) {
3042                                 if (mddev->in_sync == 0) {
3043                                         mddev->in_sync = 1;
3044                                         if (mddev->safemode == 1)
3045                                                 mddev->safemode = 0;
3046                                         if (mddev->persistent)
3047                                                 set_bit(MD_CHANGE_CLEAN,
3048                                                         &mddev->flags);
3049                                 }
3050                                 err = 0;
3051                         } else
3052                                 err = -EBUSY;
3053                         spin_unlock_irq(&mddev->write_lock);
3054                 } else
3055                         err = -EINVAL;
3056                 break;
3057         case active:
3058                 if (mddev->pers) {
3059                         restart_array(mddev);
3060                         if (mddev->external)
3061                                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3062                         wake_up(&mddev->sb_wait);
3063                         err = 0;
3064                 } else {
3065                         mddev->ro = 0;
3066                         set_disk_ro(mddev->gendisk, 0);
3067                         err = do_md_run(mddev);
3068                 }
3069                 break;
3070         case write_pending:
3071         case active_idle:
3072                 /* these cannot be set */
3073                 break;
3074         }
3075         if (err)
3076                 return err;
3077         else {
3078                 sysfs_notify_dirent(mddev->sysfs_state);
3079                 return len;
3080         }
3081 }
3082 static struct md_sysfs_entry md_array_state =
3083 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3084
3085 static ssize_t
3086 null_show(mddev_t *mddev, char *page)
3087 {
3088         return -EINVAL;
3089 }
3090
3091 static ssize_t
3092 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3093 {
3094         /* buf must be %d:%d\n? giving major and minor numbers */
3095         /* The new device is added to the array.
3096          * If the array has a persistent superblock, we read the
3097          * superblock to initialise info and check validity.
3098          * Otherwise, only checking done is that in bind_rdev_to_array,
3099          * which mainly checks size.
3100          */
3101         char *e;
3102         int major = simple_strtoul(buf, &e, 10);
3103         int minor;
3104         dev_t dev;
3105         mdk_rdev_t *rdev;
3106         int err;
3107
3108         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3109                 return -EINVAL;
3110         minor = simple_strtoul(e+1, &e, 10);
3111         if (*e && *e != '\n')
3112                 return -EINVAL;
3113         dev = MKDEV(major, minor);
3114         if (major != MAJOR(dev) ||
3115             minor != MINOR(dev))
3116                 return -EOVERFLOW;
3117
3118
3119         if (mddev->persistent) {
3120                 rdev = md_import_device(dev, mddev->major_version,
3121                                         mddev->minor_version);
3122                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3123                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3124                                                        mdk_rdev_t, same_set);
3125                         err = super_types[mddev->major_version]
3126                                 .load_super(rdev, rdev0, mddev->minor_version);
3127                         if (err < 0)
3128                                 goto out;
3129                 }
3130         } else if (mddev->external)
3131                 rdev = md_import_device(dev, -2, -1);
3132         else
3133                 rdev = md_import_device(dev, -1, -1);
3134
3135         if (IS_ERR(rdev))
3136                 return PTR_ERR(rdev);
3137         err = bind_rdev_to_array(rdev, mddev);
3138  out:
3139         if (err)
3140                 export_rdev(rdev);
3141         return err ? err : len;
3142 }
3143
3144 static struct md_sysfs_entry md_new_device =
3145 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3146
3147 static ssize_t
3148 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3149 {
3150         char *end;
3151         unsigned long chunk, end_chunk;
3152
3153         if (!mddev->bitmap)
3154                 goto out;
3155         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3156         while (*buf) {
3157                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3158                 if (buf == end) break;
3159                 if (*end == '-') { /* range */
3160                         buf = end + 1;
3161                         end_chunk = simple_strtoul(buf, &end, 0);
3162                         if (buf == end) break;
3163                 }
3164                 if (*end && !isspace(*end)) break;
3165                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3166                 buf = end;
3167                 while (isspace(*buf)) buf++;
3168         }
3169         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3170 out:
3171         return len;
3172 }
3173
3174 static struct md_sysfs_entry md_bitmap =
3175 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3176
3177 static ssize_t
3178 size_show(mddev_t *mddev, char *page)
3179 {
3180         return sprintf(page, "%llu\n",
3181                 (unsigned long long)mddev->dev_sectors / 2);
3182 }
3183
3184 static int update_size(mddev_t *mddev, sector_t num_sectors);
3185
3186 static ssize_t
3187 size_store(mddev_t *mddev, const char *buf, size_t len)
3188 {
3189         /* If array is inactive, we can reduce the component size, but
3190          * not increase it (except from 0).
3191          * If array is active, we can try an on-line resize
3192          */
3193         sector_t sectors;
3194         int err = strict_blocks_to_sectors(buf, &sectors);
3195
3196         if (err < 0)
3197                 return err;
3198         if (mddev->pers) {
3199                 err = update_size(mddev, sectors);
3200                 md_update_sb(mddev, 1);
3201         } else {
3202                 if (mddev->dev_sectors == 0 ||
3203                     mddev->dev_sectors > sectors)
3204                         mddev->dev_sectors = sectors;
3205                 else
3206                         err = -ENOSPC;
3207         }
3208         return err ? err : len;
3209 }
3210
3211 static struct md_sysfs_entry md_size =
3212 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3213
3214
3215 /* Metadata version.
3216  * This is one of
3217  *   'none' for arrays with no metadata (good luck...)
3218  *   'external' for arrays with externally managed metadata,
3219  * or N.M for internally known formats
3220  */
3221 static ssize_t
3222 metadata_show(mddev_t *mddev, char *page)
3223 {
3224         if (mddev->persistent)
3225                 return sprintf(page, "%d.%d\n",
3226                                mddev->major_version, mddev->minor_version);
3227         else if (mddev->external)
3228                 return sprintf(page, "external:%s\n", mddev->metadata_type);
3229         else
3230                 return sprintf(page, "none\n");
3231 }
3232
3233 static ssize_t
3234 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3235 {
3236         int major, minor;
3237         char *e;
3238         /* Changing the details of 'external' metadata is
3239          * always permitted.  Otherwise there must be
3240          * no devices attached to the array.
3241          */
3242         if (mddev->external && strncmp(buf, "external:", 9) == 0)
3243                 ;
3244         else if (!list_empty(&mddev->disks))
3245                 return -EBUSY;
3246
3247         if (cmd_match(buf, "none")) {
3248                 mddev->persistent = 0;
3249                 mddev->external = 0;
3250                 mddev->major_version = 0;
3251                 mddev->minor_version = 90;
3252                 return len;
3253         }
3254         if (strncmp(buf, "external:", 9) == 0) {
3255                 size_t namelen = len-9;
3256                 if (namelen >= sizeof(mddev->metadata_type))
3257                         namelen = sizeof(mddev->metadata_type)-1;
3258                 strncpy(mddev->metadata_type, buf+9, namelen);
3259                 mddev->metadata_type[namelen] = 0;
3260                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3261                         mddev->metadata_type[--namelen] = 0;
3262                 mddev->persistent = 0;
3263                 mddev->external = 1;
3264                 mddev->major_version = 0;
3265                 mddev->minor_version = 90;
3266                 return len;
3267         }
3268         major = simple_strtoul(buf, &e, 10);
3269         if (e==buf || *e != '.')
3270                 return -EINVAL;
3271         buf = e+1;
3272         minor = simple_strtoul(buf, &e, 10);
3273         if (e==buf || (*e && *e != '\n') )
3274                 return -EINVAL;
3275         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3276                 return -ENOENT;
3277         mddev->major_version = major;
3278         mddev->minor_version = minor;
3279         mddev->persistent = 1;
3280         mddev->external = 0;
3281         return len;
3282 }
3283
3284 static struct md_sysfs_entry md_metadata =
3285 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3286
3287 static ssize_t
3288 action_show(mddev_t *mddev, char *page)
3289 {
3290         char *type = "idle";
3291         if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3292                 type = "frozen";
3293         else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3294             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3295                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3296                         type = "reshape";
3297                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3298                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3299                                 type = "resync";
3300                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3301                                 type = "check";
3302                         else
3303                                 type = "repair";
3304                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3305                         type = "recover";
3306         }
3307         return sprintf(page, "%s\n", type);
3308 }
3309
3310 static ssize_t
3311 action_store(mddev_t *mddev, const char *page, size_t len)
3312 {
3313         if (!mddev->pers || !mddev->pers->sync_request)
3314                 return -EINVAL;
3315
3316         if (cmd_match(page, "frozen"))
3317                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3318         else
3319                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3320
3321         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3322                 if (mddev->sync_thread) {
3323                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3324                         md_unregister_thread(mddev->sync_thread);
3325                         mddev->sync_thread = NULL;
3326                         mddev->recovery = 0;
3327                 }
3328         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3329                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3330                 return -EBUSY;
3331         else if (cmd_match(page, "resync"))
3332                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3333         else if (cmd_match(page, "recover")) {
3334                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3335                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3336         } else if (cmd_match(page, "reshape")) {
3337                 int err;
3338                 if (mddev->pers->start_reshape == NULL)
3339                         return -EINVAL;
3340                 err = mddev->pers->start_reshape(mddev);
3341                 if (err)
3342                         return err;
3343                 sysfs_notify(&mddev->kobj, NULL, "degraded");
3344         } else {
3345                 if (cmd_match(page, "check"))
3346                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3347                 else if (!cmd_match(page, "repair"))
3348                         return -EINVAL;
3349                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3350                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3351         }
3352         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3353         md_wakeup_thread(mddev->thread);
3354         sysfs_notify_dirent(mddev->sysfs_action);
3355         return len;
3356 }
3357
3358 static ssize_t
3359 mismatch_cnt_show(mddev_t *mddev, char *page)
3360 {
3361         return sprintf(page, "%llu\n",
3362                        (unsigned long long) mddev->resync_mismatches);
3363 }
3364
3365 static struct md_sysfs_entry md_scan_mode =
3366 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3367
3368
3369 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3370
3371 static ssize_t
3372 sync_min_show(mddev_t *mddev, char *page)
3373 {
3374         return sprintf(page, "%d (%s)\n", speed_min(mddev),
3375                        mddev->sync_speed_min ? "local": "system");
3376 }
3377
3378 static ssize_t
3379 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3380 {
3381         int min;
3382         char *e;
3383         if (strncmp(buf, "system", 6)==0) {
3384                 mddev->sync_speed_min = 0;
3385                 return len;
3386         }
3387         min = simple_strtoul(buf, &e, 10);
3388         if (buf == e || (*e && *e != '\n') || min <= 0)
3389                 return -EINVAL;
3390         mddev->sync_speed_min = min;
3391         return len;
3392 }
3393
3394 static struct md_sysfs_entry md_sync_min =
3395 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3396
3397 static ssize_t
3398 sync_max_show(mddev_t *mddev, char *page)
3399 {
3400         return sprintf(page, "%d (%s)\n", speed_max(mddev),
3401                        mddev->sync_speed_max ? "local": "system");
3402 }
3403
3404 static ssize_t
3405 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3406 {
3407         int max;
3408         char *e;
3409         if (strncmp(buf, "system", 6)==0) {
3410                 mddev->sync_speed_max = 0;
3411                 return len;
3412         }
3413         max = simple_strtoul(buf, &e, 10);
3414         if (buf == e || (*e && *e != '\n') || max <= 0)
3415                 return -EINVAL;
3416         mddev->sync_speed_max = max;
3417         return len;
3418 }
3419
3420 static struct md_sysfs_entry md_sync_max =
3421 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3422
3423 static ssize_t
3424 degraded_show(mddev_t *mddev, char *page)
3425 {
3426         return sprintf(page, "%d\n", mddev->degraded);
3427 }
3428 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3429
3430 static ssize_t
3431 sync_force_parallel_show(mddev_t *mddev, char *page)
3432 {
3433         return sprintf(page, "%d\n", mddev->parallel_resync);
3434 }
3435
3436 static ssize_t
3437 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3438 {
3439         long n;
3440
3441         if (strict_strtol(buf, 10, &n))
3442                 return -EINVAL;
3443
3444         if (n != 0 && n != 1)
3445                 return -EINVAL;
3446
3447         mddev->parallel_resync = n;
3448
3449         if (mddev->sync_thread)
3450                 wake_up(&resync_wait);
3451
3452         return len;
3453 }
3454
3455 /* force parallel resync, even with shared block devices */
3456 static struct md_sysfs_entry md_sync_force_parallel =
3457 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3458        sync_force_parallel_show, sync_force_parallel_store);
3459
3460 static ssize_t
3461 sync_speed_show(mddev_t *mddev, char *page)
3462 {
3463         unsigned long resync, dt, db;
3464         if (mddev->curr_resync == 0)
3465                 return sprintf(page, "none\n");
3466         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3467         dt = (jiffies - mddev->resync_mark) / HZ;
3468         if (!dt) dt++;
3469         db = resync - mddev->resync_mark_cnt;
3470         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3471 }
3472
3473 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3474
3475 static ssize_t
3476 sync_completed_show(mddev_t *mddev, char *page)
3477 {
3478         unsigned long max_sectors, resync;
3479
3480         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3481                 return sprintf(page, "none\n");
3482
3483         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3484                 max_sectors = mddev->resync_max_sectors;
3485         else
3486                 max_sectors = mddev->dev_sectors;
3487
3488         resync = mddev->curr_resync_completed;
3489         return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3490 }
3491
3492 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3493
3494 static ssize_t
3495 min_sync_show(mddev_t *mddev, char *page)
3496 {
3497         return sprintf(page, "%llu\n",
3498                        (unsigned long long)mddev->resync_min);
3499 }
3500 static ssize_t
3501 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3502 {
3503         unsigned long long min;
3504         if (strict_strtoull(buf, 10, &min))
3505                 return -EINVAL;
3506         if (min > mddev->resync_max)
3507                 return -EINVAL;
3508         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3509                 return -EBUSY;
3510
3511         /* Must be a multiple of chunk_size */
3512         if (mddev->chunk_sectors) {
3513                 sector_t temp = min;
3514                 if (sector_div(temp, mddev->chunk_sectors))
3515                         return -EINVAL;
3516         }
3517         mddev->resync_min = min;
3518
3519         return len;
3520 }
3521
3522 static struct md_sysfs_entry md_min_sync =
3523 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3524
3525 static ssize_t
3526 max_sync_show(mddev_t *mddev, char *page)
3527 {
3528         if (mddev->resync_max == MaxSector)
3529                 return sprintf(page, "max\n");
3530         else
3531                 return sprintf(page, "%llu\n",
3532                                (unsigned long long)mddev->resync_max);
3533 }
3534 static ssize_t
3535 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3536 {
3537         if (strncmp(buf, "max", 3) == 0)
3538                 mddev->resync_max = MaxSector;
3539         else {
3540                 unsigned long long max;
3541                 if (strict_strtoull(buf, 10, &max))
3542                         return -EINVAL;
3543                 if (max < mddev->resync_min)
3544                         return -EINVAL;
3545                 if (max < mddev->resync_max &&
3546                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3547                         return -EBUSY;
3548
3549                 /* Must be a multiple of chunk_size */
3550                 if (mddev->chunk_sectors) {
3551                         sector_t temp = max;
3552                         if (sector_div(temp, mddev->chunk_sectors))
3553                                 return -EINVAL;
3554                 }
3555                 mddev->resync_max = max;
3556         }
3557         wake_up(&mddev->recovery_wait);
3558         return len;
3559 }
3560
3561 static struct md_sysfs_entry md_max_sync =
3562 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3563
3564 static ssize_t
3565 suspend_lo_show(mddev_t *mddev, char *page)
3566 {
3567         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3568 }
3569
3570 static ssize_t
3571 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3572 {
3573         char *e;
3574         unsigned long long new = simple_strtoull(buf, &e, 10);
3575
3576         if (mddev->pers->quiesce == NULL)
3577                 return -EINVAL;
3578         if (buf == e || (*e && *e != '\n'))
3579                 return -EINVAL;
3580         if (new >= mddev->suspend_hi ||
3581             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3582                 mddev->suspend_lo = new;
3583                 mddev->pers->quiesce(mddev, 2);
3584                 return len;
3585         } else
3586                 return -EINVAL;
3587 }
3588 static struct md_sysfs_entry md_suspend_lo =
3589 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3590
3591
3592 static ssize_t
3593 suspend_hi_show(mddev_t *mddev, char *page)
3594 {
3595         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3596 }
3597
3598 static ssize_t
3599 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3600 {
3601         char *e;
3602         unsigned long long new = simple_strtoull(buf, &e, 10);
3603
3604         if (mddev->pers->quiesce == NULL)
3605                 return -EINVAL;
3606         if (buf == e || (*e && *e != '\n'))
3607                 return -EINVAL;
3608         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3609             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3610                 mddev->suspend_hi = new;
3611                 mddev->pers->quiesce(mddev, 1);
3612                 mddev->pers->quiesce(mddev, 0);
3613                 return len;
3614         } else
3615                 return -EINVAL;
3616 }
3617 static struct md_sysfs_entry md_suspend_hi =
3618 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3619
3620 static ssize_t
3621 reshape_position_show(mddev_t *mddev, char *page)
3622 {
3623         if (mddev->reshape_position != MaxSector)
3624                 return sprintf(page, "%llu\n",
3625                                (unsigned long long)mddev->reshape_position);
3626         strcpy(page, "none\n");
3627         return 5;
3628 }
3629
3630 static ssize_t
3631 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3632 {
3633         char *e;
3634         unsigned long long new = simple_strtoull(buf, &e, 10);
3635         if (mddev->pers)
3636                 return -EBUSY;
3637         if (buf == e || (*e && *e != '\n'))
3638                 return -EINVAL;
3639         mddev->reshape_position = new;
3640         mddev->delta_disks = 0;
3641         mddev->new_level = mddev->level;
3642         mddev->new_layout = mddev->layout;
3643         mddev->new_chunk_sectors = mddev->chunk_sectors;
3644         return len;
3645 }
3646
3647 static struct md_sysfs_entry md_reshape_position =
3648 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3649        reshape_position_store);
3650
3651 static ssize_t
3652 array_size_show(mddev_t *mddev, char *page)
3653 {
3654         if (mddev->external_size)
3655                 return sprintf(page, "%llu\n",
3656                                (unsigned long long)mddev->array_sectors/2);
3657         else
3658                 return sprintf(page, "default\n");
3659 }
3660
3661 static ssize_t
3662 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3663 {
3664         sector_t sectors;
3665
3666         if (strncmp(buf, "default", 7) == 0) {
3667                 if (mddev->pers)
3668                         sectors = mddev->pers->size(mddev, 0, 0);
3669                 else
3670                         sectors = mddev->array_sectors;
3671
3672                 mddev->external_size = 0;
3673         } else {
3674                 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3675                         return -EINVAL;
3676                 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3677                         return -E2BIG;
3678
3679                 mddev->external_size = 1;
3680         }
3681
3682         mddev->array_sectors = sectors;
3683         set_capacity(mddev->gendisk, mddev->array_sectors);
3684         if (mddev->pers) {
3685                 struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
3686
3687                 if (bdev) {
3688                         mutex_lock(&bdev->bd_inode->i_mutex);
3689                         i_size_write(bdev->bd_inode,
3690                                      (loff_t)mddev->array_sectors << 9);
3691                         mutex_unlock(&bdev->bd_inode->i_mutex);
3692                         bdput(bdev);
3693                 }
3694         }
3695
3696         return len;
3697 }
3698
3699 static struct md_sysfs_entry md_array_size =
3700 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3701        array_size_store);
3702
3703 static struct attribute *md_default_attrs[] = {
3704         &md_level.attr,
3705         &md_layout.attr,
3706         &md_raid_disks.attr,
3707         &md_chunk_size.attr,
3708         &md_size.attr,
3709         &md_resync_start.attr,
3710         &md_metadata.attr,
3711         &md_new_device.attr,
3712         &md_safe_delay.attr,
3713         &md_array_state.attr,
3714         &md_reshape_position.attr,
3715         &md_array_size.attr,
3716         NULL,
3717 };
3718
3719 static struct attribute *md_redundancy_attrs[] = {
3720         &md_scan_mode.attr,
3721         &md_mismatches.attr,
3722         &md_sync_min.attr,
3723         &md_sync_max.attr,
3724         &md_sync_speed.attr,
3725         &md_sync_force_parallel.attr,
3726         &md_sync_completed.attr,
3727         &md_min_sync.attr,
3728         &md_max_sync.attr,
3729         &md_suspend_lo.attr,
3730         &md_suspend_hi.attr,
3731         &md_bitmap.attr,
3732         &md_degraded.attr,
3733         NULL,
3734 };
3735 static struct attribute_group md_redundancy_group = {
3736         .name = NULL,
3737         .attrs = md_redundancy_attrs,
3738 };
3739
3740
3741 static ssize_t
3742 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3743 {
3744         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3745         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3746         ssize_t rv;
3747
3748         if (!entry->show)
3749                 return -EIO;
3750         rv = mddev_lock(mddev);
3751         if (!rv) {
3752                 rv = entry->show(mddev, page);
3753                 mddev_unlock(mddev);
3754         }
3755         return rv;
3756 }
3757
3758 static ssize_t
3759 md_attr_store(struct kobject *kobj, struct attribute *attr,
3760               const char *page, size_t length)
3761 {
3762         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3763         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3764         ssize_t rv;
3765
3766         if (!entry->store)
3767                 return -EIO;
3768         if (!capable(CAP_SYS_ADMIN))
3769                 return -EACCES;
3770         rv = mddev_lock(mddev);
3771         if (mddev->hold_active == UNTIL_IOCTL)
3772                 mddev->hold_active = 0;
3773         if (!rv) {
3774                 rv = entry->store(mddev, page, length);
3775                 mddev_unlock(mddev);
3776         }
3777         return rv;
3778 }
3779
3780 static void md_free(struct kobject *ko)
3781 {
3782         mddev_t *mddev = container_of(ko, mddev_t, kobj);
3783
3784         if (mddev->sysfs_state)
3785                 sysfs_put(mddev->sysfs_state);
3786
3787         if (mddev->gendisk) {
3788                 del_gendisk(mddev->gendisk);
3789                 put_disk(mddev->gendisk);
3790         }
3791         if (mddev->queue)
3792                 blk_cleanup_queue(mddev->queue);
3793
3794         kfree(mddev);
3795 }
3796
3797 static struct sysfs_ops md_sysfs_ops = {
3798         .show   = md_attr_show,
3799         .store  = md_attr_store,
3800 };
3801 static struct kobj_type md_ktype = {
3802         .release        = md_free,
3803         .sysfs_ops      = &md_sysfs_ops,
3804         .default_attrs  = md_default_attrs,
3805 };
3806
3807 int mdp_major = 0;
3808
3809 static void mddev_delayed_delete(struct work_struct *ws)
3810 {
3811         mddev_t *mddev = container_of(ws, mddev_t, del_work);
3812
3813         if (mddev->private == &md_redundancy_group) {
3814                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3815                 if (mddev->sysfs_action)
3816                         sysfs_put(mddev->sysfs_action);
3817                 mddev->sysfs_action = NULL;
3818                 mddev->private = NULL;
3819         }
3820         kobject_del(&mddev->kobj);
3821         kobject_put(&mddev->kobj);
3822 }
3823
3824 static int md_alloc(dev_t dev, char *name)
3825 {
3826         static DEFINE_MUTEX(disks_mutex);
3827         mddev_t *mddev = mddev_find(dev);
3828         struct gendisk *disk;
3829         int partitioned;
3830         int shift;
3831         int unit;
3832         int error;
3833
3834         if (!mddev)
3835                 return -ENODEV;
3836
3837         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3838         shift = partitioned ? MdpMinorShift : 0;
3839         unit = MINOR(mddev->unit) >> shift;
3840
3841         /* wait for any previous instance if this device
3842          * to be completed removed (mddev_delayed_delete).
3843          */
3844         flush_scheduled_work();
3845
3846         mutex_lock(&disks_mutex);
3847         if (mddev->gendisk) {
3848                 mutex_unlock(&disks_mutex);
3849                 mddev_put(mddev);
3850                 return -EEXIST;
3851         }
3852
3853         if (name) {
3854                 /* Need to ensure that 'name' is not a duplicate.
3855                  */
3856                 mddev_t *mddev2;
3857                 spin_lock(&all_mddevs_lock);
3858
3859                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3860                         if (mddev2->gendisk &&
3861                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
3862                                 spin_unlock(&all_mddevs_lock);
3863                                 return -EEXIST;
3864                         }
3865                 spin_unlock(&all_mddevs_lock);
3866         }
3867
3868         mddev->queue = blk_alloc_queue(GFP_KERNEL);
3869         if (!mddev->queue) {
3870                 mutex_unlock(&disks_mutex);
3871                 mddev_put(mddev);
3872                 return -ENOMEM;
3873         }
3874         mddev->queue->queuedata = mddev;
3875
3876         /* Can be unlocked because the queue is new: no concurrency */
3877         queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3878
3879         blk_queue_make_request(mddev->queue, md_make_request);
3880
3881         disk = alloc_disk(1 << shift);
3882         if (!disk) {
3883                 mutex_unlock(&disks_mutex);
3884                 blk_cleanup_queue(mddev->queue);
3885                 mddev->queue = NULL;
3886                 mddev_put(mddev);
3887                 return -ENOMEM;
3888         }
3889         disk->major = MAJOR(mddev->unit);
3890         disk->first_minor = unit << shift;
3891         if (name)
3892                 strcpy(disk->disk_name, name);
3893         else if (partitioned)
3894                 sprintf(disk->disk_name, "md_d%d", unit);
3895         else
3896                 sprintf(disk->disk_name, "md%d", unit);
3897         disk->fops = &md_fops;
3898         disk->private_data = mddev;
3899         disk->queue = mddev->queue;
3900         /* Allow extended partitions.  This makes the
3901          * 'mdp' device redundant, but we can't really
3902          * remove it now.
3903          */
3904         disk->flags |= GENHD_FL_EXT_DEVT;
3905         add_disk(disk);
3906         mddev->gendisk = disk;
3907         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3908                                      &disk_to_dev(disk)->kobj, "%s", "md");
3909         mutex_unlock(&disks_mutex);
3910         if (error)
3911                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3912                        disk->disk_name);
3913         else {
3914                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3915                 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3916         }
3917         mddev_put(mddev);
3918         return 0;
3919 }
3920
3921 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3922 {
3923         md_alloc(dev, NULL);
3924         return NULL;
3925 }
3926
3927 static int add_named_array(const char *val, struct kernel_param *kp)
3928 {
3929         /* val must be "md_*" where * is not all digits.
3930          * We allocate an array with a large free minor number, and
3931          * set the name to val.  val must not already be an active name.
3932          */
3933         int len = strlen(val);
3934         char buf[DISK_NAME_LEN];
3935
3936         while (len && val[len-1] == '\n')
3937                 len--;
3938         if (len >= DISK_NAME_LEN)
3939                 return -E2BIG;
3940         strlcpy(buf, val, len+1);
3941         if (strncmp(buf, "md_", 3) != 0)
3942                 return -EINVAL;
3943         return md_alloc(0, buf);
3944 }
3945
3946 static void md_safemode_timeout(unsigned long data)
3947 {
3948         mddev_t *mddev = (mddev_t *) data;
3949
3950         if (!atomic_read(&mddev->writes_pending)) {
3951                 mddev->safemode = 1;
3952                 if (mddev->external)
3953                         sysfs_notify_dirent(mddev->sysfs_state);
3954         }
3955         md_wakeup_thread(mddev->thread);
3956 }
3957
/* Copied into mddev->ok_start_degraded by do_md_run() before the personality's run() is called. */
3958 static int start_dirty_degraded;
3959
3960 static int do_md_run(mddev_t * mddev)
3961 {
3962         int err;
3963         mdk_rdev_t *rdev;
3964         struct gendisk *disk;
3965         struct mdk_personality *pers;
3966
3967         if (list_empty(&mddev->disks))
3968                 /* cannot run an array with no devices.. */
3969                 return -EINVAL;
3970
3971         if (mddev->pers)
3972                 return -EBUSY;
3973
3974         /*
3975          * Analyze all RAID superblock(s)
3976          */
3977         if (!mddev->raid_disks) {
3978                 if (!mddev->persistent)
3979                         return -EINVAL;
3980                 analyze_sbs(mddev);
3981         }
3982
3983         if (mddev->level != LEVEL_NONE)
3984                 request_module("md-level-%d", mddev->level);
3985         else if (mddev->clevel[0])
3986                 request_module("md-%s", mddev->clevel);
3987
3988         /*
3989          * Drop all container device buffers, from now on
3990          * the only valid external interface is through the md
3991          * device.
3992          */
3993         list_for_each_entry(rdev, &mddev->disks, same_set) {
3994                 if (test_bit(Faulty, &rdev->flags))
3995                         continue;
3996                 sync_blockdev(rdev->bdev);
3997                 invalidate_bdev(rdev->bdev);
3998
3999                 /* perform some consistency tests on the device.
4000                  * We don't want the data to overlap the metadata,
4001                  * Internal Bitmap issues have been handled elsewhere.
4002                  */
4003                 if (rdev->data_offset < rdev->sb_start) {
4004                         if (mddev->dev_sectors &&
4005                             rdev->data_offset + mddev->dev_sectors
4006                             > rdev->sb_start) {
4007                                 printk("md: %s: data overlaps metadata\n",
4008                                        mdname(mddev));
4009                                 return -EINVAL;
4010                         }
4011                 } else {
4012                         if (rdev->sb_start + rdev->sb_size/512
4013                             > rdev->data_offset) {
4014                                 printk("md: %s: metadata overlaps data\n",
4015                                        mdname(mddev));
4016                                 return -EINVAL;
4017                         }
4018                 }
4019                 sysfs_notify_dirent(rdev->sysfs_state);
4020         }
4021
4022         md_probe(mddev->unit, NULL, NULL);
4023         disk = mddev->gendisk;
4024         if (!disk)
4025                 return -ENOMEM;
4026
4027         spin_lock(&pers_lock);
4028         pers = find_pers(mddev->level, mddev->clevel);
4029         if (!pers || !try_module_get(pers->owner)) {
4030                 spin_unlock(&pers_lock);
4031                 if (mddev->level != LEVEL_NONE)
4032                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4033                                mddev->level);
4034                 else
4035                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4036                                mddev->clevel);
4037                 return -EINVAL;
4038         }
4039         mddev->pers = pers;
4040         spin_unlock(&pers_lock);
4041         if (mddev->level != pers->level) {
4042                 mddev->level = pers->level;
4043                 mddev->new_level = pers->level;
4044         }
4045         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4046
4047         if (pers->level >= 4 && pers->level <= 6)
4048                 /* Cannot support integrity (yet) */
4049                 blk_integrity_unregister(mddev->gendisk);
4050
4051         if (mddev->reshape_position != MaxSector &&
4052             pers->start_reshape == NULL) {
4053                 /* This personality cannot handle reshaping... */
4054                 mddev->pers = NULL;
4055                 module_put(pers->owner);
4056                 return -EINVAL;
4057         }
4058
4059         if (pers->sync_request) {
4060                 /* Warn if this is a potentially silly
4061                  * configuration.
4062                  */
4063                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4064                 mdk_rdev_t *rdev2;
4065                 int warned = 0;
4066
4067                 list_for_each_entry(rdev, &mddev->disks, same_set)
4068                         list_for_each_entry(rdev2, &mddev->disks, same_set) {
4069                                 if (rdev < rdev2 &&
4070                                     rdev->bdev->bd_contains ==
4071                                     rdev2->bdev->bd_contains) {
4072                                         printk(KERN_WARNING
4073                                                "%s: WARNING: %s appears to be"
4074                                                " on the same physical disk as"
4075                                                " %s.\n",
4076                                                mdname(mddev),
4077                                                bdevname(rdev->bdev,b),
4078                                                bdevname(rdev2->bdev,b2));
4079                                         warned = 1;
4080                                 }
4081                         }
4082
4083                 if (warned)
4084                         printk(KERN_WARNING
4085                                "True protection against single-disk"
4086                                " failure might be compromised.\n");
4087         }
4088
4089         mddev->recovery = 0;
4090         /* may be over-ridden by personality */
4091         mddev->resync_max_sectors = mddev->dev_sectors;
4092
4093         mddev->barriers_work = 1;
4094         mddev->ok_start_degraded = start_dirty_degraded;
4095
4096         if (start_readonly)
4097                 mddev->ro = 2; /* read-only, but switch on first write */
4098
4099         err = mddev->pers->run(mddev);
4100         if (err)
4101                 printk(KERN_ERR "md: pers->run() failed ...\n");
4102         else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4103                 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4104                           " but 'external_size' not in effect?\n", __func__);
4105                 printk(KERN_ERR
4106                        "md: invalid array_size %llu > default size %llu\n",
4107                        (unsigned long long)mddev->array_sectors / 2,
4108                        (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4109                 err = -EINVAL;
4110                 mddev->pers->stop(mddev);
4111         }
4112         if (err == 0 && mddev->pers->sync_request) {
4113                 err = bitmap_create(mddev);
4114                 if (err) {
4115                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4116                                mdname(mddev), err);
4117                         mddev->pers->stop(mddev);
4118                 }
4119         }
4120         if (err) {
4121                 module_put(mddev->pers->owner);
4122                 mddev->pers = NULL;
4123                 bitmap_destroy(mddev);
4124                 return err;
4125         }
4126         if (mddev->pers->sync_request) {
4127                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4128                         printk(KERN_WARNING
4129                                "md: cannot register extra attributes for %s\n",
4130                                mdname(mddev));
4131                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4132         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4133                 mddev->ro = 0;
4134
4135         atomic_set(&mddev->writes_pending,0);
4136         mddev->safemode = 0;
4137         mddev->safemode_timer.function = md_safemode_timeout;
4138         mddev->safemode_timer.data = (unsigned long) mddev;
4139         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4140         mddev->in_sync = 1;
4141
4142         list_for_each_entry(rdev, &mddev->disks, same_set)
4143                 if (rdev->raid_disk >= 0) {
4144                         char nm[20];
4145                         sprintf(nm, "rd%d", rdev->raid_disk);
4146                         if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4147                                 printk("md: cannot register %s for %s\n",
4148                                        nm, mdname(mddev));
4149                 }
4150         
4151         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4152         
4153         if (mddev->flags)
4154                 md_update_sb(mddev, 0);
4155
4156         set_capacity(disk, mddev->array_sectors);
4157
4158         /* If there is a partially-recovered drive we need to
4159          * start recovery here.  If we leave it to md_check_recovery,
4160          * it will remove the drives and not do the right thing
4161          */
4162         if (mddev->degraded && !mddev->sync_thread) {
4163                 int spares = 0;
4164                 list_for_each_entry(rdev, &mddev->disks, same_set)
4165                         if (rdev->raid_disk >= 0 &&
4166                             !test_bit(In_sync, &rdev->flags) &&
4167                             !test_bit(Faulty, &rdev->flags))
4168                                 /* complete an interrupted recovery */
4169                                 spares++;
4170                 if (spares && mddev->pers->sync_request) {
4171                         mddev->recovery = 0;
4172                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4173                         mddev->sync_thread = md_register_thread(md_do_sync,
4174                                                                 mddev,
4175                                                                 "%s_resync");
4176                         if (!mddev->sync_thread) {
4177                                 printk(KERN_ERR "%s: could not start resync"
4178                                        " thread...\n",
4179                                        mdname(mddev));
4180                                 /* leave the spares where they are, it shouldn't hurt */
4181                                 mddev->recovery = 0;
4182                         }
4183                 }
4184         }
4185         md_wakeup_thread(mddev->thread);
4186         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4187
4188         mddev->changed = 1;
4189         md_new_event(mddev);
4190         sysfs_notify_dirent(mddev->sysfs_state);
4191         if (mddev->sysfs_action)
4192                 sysfs_notify_dirent(mddev->sysfs_action);
4193         sysfs_notify(&mddev->kobj, NULL, "degraded");
4194         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4195         return 0;
4196 }
4197
4198 static int restart_array(mddev_t *mddev)
4199 {
4200         struct gendisk *disk = mddev->gendisk;
4201
4202         /* Complain if it has no devices */
4203         if (list_empty(&mddev->disks))
4204                 return -ENXIO;
4205         if (!mddev->pers)
4206                 return -EINVAL;
4207         if (!mddev->ro)
4208                 return -EBUSY;
4209         mddev->safemode = 0;
4210         mddev->ro = 0;
4211         set_disk_ro(disk, 0);
4212         printk(KERN_INFO "md: %s switched to read-write mode.\n",
4213                 mdname(mddev));
4214         /* Kick recovery or resync if necessary */
4215         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4216         md_wakeup_thread(mddev->thread);
4217         md_wakeup_thread(mddev->sync_thread);
4218         sysfs_notify_dirent(mddev->sysfs_state);
4219         return 0;
4220 }
4221
4222 /* similar to deny_write_access, but accounts for our holding a reference
4223  * to the file ourselves */
4224 static int deny_bitmap_write_access(struct file * file)
4225 {
4226         struct inode *inode = file->f_mapping->host;
4227
4228         spin_lock(&inode->i_lock);
4229         if (atomic_read(&inode->i_writecount) > 1) {
4230                 spin_unlock(&inode->i_lock);
4231                 return -ETXTBSY;
4232         }
4233         atomic_set(&inode->i_writecount, -1);
4234         spin_unlock(&inode->i_lock);
4235
4236         return 0;
4237 }
4238
4239 static void restore_bitmap_write_access(struct file *file)
4240 {
4241         struct inode *inode = file->f_mapping->host;
4242
4243         spin_lock(&inode->i_lock);
4244         atomic_set(&inode->i_writecount, 1);
4245         spin_unlock(&inode->i_lock);
4246 }
4247
4248 /* mode:
4249  *   0 - completely stop and dis-assemble array
4250  *   1 - switch to readonly
4251  *   2 - stop but do not disassemble array
4252  */
4253 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4254 {
4255         int err = 0;
4256         struct gendisk *disk = mddev->gendisk;
4257         mdk_rdev_t *rdev;
4258
4259         if (atomic_read(&mddev->openers) > is_open) {
4260                 printk("md: %s still in use.\n",mdname(mddev));
4261                 return -EBUSY;
4262         }
4263
4264         if (mddev->pers) {
4265
4266                 if (mddev->sync_thread) {
4267                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4268                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4269                         md_unregister_thread(mddev->sync_thread);
4270                         mddev->sync_thread = NULL;
4271                 }
4272
4273                 del_timer_sync(&mddev->safemode_timer);
4274
4275                 switch(mode) {
4276                 case 1: /* readonly */
4277                         err  = -ENXIO;
4278                         if (mddev->ro==1)
4279                                 goto out;
4280                         mddev->ro = 1;
4281                         break;
4282                 case 0: /* disassemble */
4283                 case 2: /* stop */
4284                         bitmap_flush(mddev);
4285                         md_super_wait(mddev);
4286                         if (mddev->ro)
4287                                 set_disk_ro(disk, 0);
4288
4289                         mddev->pers->stop(mddev);
4290                         mddev->queue->merge_bvec_fn = NULL;
4291                         mddev->queue->unplug_fn = NULL;
4292                         mddev->queue->backing_dev_info.congested_fn = NULL;
4293                         module_put(mddev->pers->owner);
4294                         if (mddev->pers->sync_request)
4295                                 mddev->private = &md_redundancy_group;
4296                         mddev->pers = NULL;
4297                         /* tell userspace to handle 'inactive' */
4298                         sysfs_notify_dirent(mddev->sysfs_state);
4299
4300                         list_for_each_entry(rdev, &mddev->disks, same_set)
4301                                 if (rdev->raid_disk >= 0) {
4302                                         char nm[20];
4303                                         sprintf(nm, "rd%d", rdev->raid_disk);
4304                                         sysfs_remove_link(&mddev->kobj, nm);
4305                                 }
4306
4307                         set_capacity(disk, 0);
4308                         mddev->changed = 1;
4309
4310                         if (mddev->ro)
4311                                 mddev->ro = 0;
4312                 }
4313                 if (!mddev->in_sync || mddev->flags) {
4314                         /* mark array as shutdown cleanly */
4315                         mddev->in_sync = 1;
4316                         md_update_sb(mddev, 1);
4317                 }
4318                 if (mode == 1)
4319                         set_disk_ro(disk, 1);
4320                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4321         }
4322
4323         /*
4324          * Free resources if final stop
4325          */
4326         if (mode == 0) {
4327
4328                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4329
4330                 bitmap_destroy(mddev);
4331                 if (mddev->bitmap_file) {
4332                         restore_bitmap_write_access(mddev->bitmap_file);
4333                         fput(mddev->bitmap_file);
4334                         mddev->bitmap_file = NULL;
4335                 }
4336                 mddev->bitmap_offset = 0;
4337
4338                 /* make sure all md_delayed_delete calls have finished */
4339                 flush_scheduled_work();
4340
4341                 export_array(mddev);
4342
4343                 mddev->array_sectors = 0;
4344                 mddev->external_size = 0;
4345                 mddev->dev_sectors = 0;
4346                 mddev->raid_disks = 0;
4347                 mddev->recovery_cp = 0;
4348                 mddev->resync_min = 0;
4349                 mddev->resync_max = MaxSector;
4350                 mddev->reshape_position = MaxSector;
4351                 mddev->external = 0;
4352                 mddev->persistent = 0;
4353                 mddev->level = LEVEL_NONE;
4354                 mddev->clevel[0] = 0;
4355                 mddev->flags = 0;
4356                 mddev->ro = 0;
4357                 mddev->metadata_type[0] = 0;
4358                 mddev->chunk_sectors = 0;
4359                 mddev->ctime = mddev->utime = 0;
4360                 mddev->layout = 0;
4361                 mddev->max_disks = 0;
4362                 mddev->events = 0;
4363                 mddev->delta_disks = 0;
4364                 mddev->new_level = LEVEL_NONE;
4365                 mddev->new_layout = 0;
4366                 mddev->new_chunk_sectors = 0;
4367                 mddev->curr_resync = 0;
4368                 mddev->resync_mismatches = 0;
4369                 mddev->suspend_lo = mddev->suspend_hi = 0;
4370                 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4371                 mddev->recovery = 0;
4372                 mddev->in_sync = 0;
4373                 mddev->changed = 0;
4374                 mddev->degraded = 0;
4375                 mddev->barriers_work = 0;
4376                 mddev->safemode = 0;
4377                 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4378                 if (mddev->hold_active == UNTIL_STOP)
4379                         mddev->hold_active = 0;
4380
4381         } else if (mddev->pers)
4382                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4383                         mdname(mddev));
4384         err = 0;
4385         blk_integrity_unregister(disk);
4386         md_new_event(mddev);
4387         sysfs_notify_dirent(mddev->sysfs_state);
4388 out:
4389         return err;
4390 }
4391
4392 #ifndef MODULE
4393 static void autorun_array(mddev_t *mddev)
4394 {
4395         mdk_rdev_t *rdev;
4396         int err;
4397
4398         if (list_empty(&mddev->disks))
4399                 return;
4400
4401         printk(KERN_INFO "md: running: ");
4402
4403         list_for_each_entry(rdev, &mddev->disks, same_set) {
4404                 char b[BDEVNAME_SIZE];
4405                 printk("<%s>", bdevname(rdev->bdev,b));
4406         }
4407         printk("\n");
4408
4409         err = do_md_run(mddev);
4410         if (err) {
4411                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4412                 do_md_stop(mddev, 0, 0);
4413         }
4414 }
4415
4416 /*
4417  * lets try to run arrays based on all disks that have arrived
4418  * until now. (those are in pending_raid_disks)
4419  *
4420  * the method: pick the first pending disk, collect all disks with
4421  * the same UUID, remove all from the pending list and put them into
4422  * the 'same_array' list. Then order this list based on superblock
4423  * update time (freshest comes first), kick out 'old' disks and
4424  * compare superblocks. If everything's fine then run it.
4425  *
4426  * If "unit" is allocated, then bump its reference count
4427  */
4428 static void autorun_devices(int part)
4429 {
4430         mdk_rdev_t *rdev0, *rdev, *tmp;
4431         mddev_t *mddev;
4432         char b[BDEVNAME_SIZE];
4433
4434         printk(KERN_INFO "md: autorun ...\n");
4435         while (!list_empty(&pending_raid_disks)) {
4436                 int unit;
4437                 dev_t dev;
4438                 LIST_HEAD(candidates);
4439                 rdev0 = list_entry(pending_raid_disks.next,
4440                                          mdk_rdev_t, same_set);
4441
4442                 printk(KERN_INFO "md: considering %s ...\n",
4443                         bdevname(rdev0->bdev,b));
4444                 INIT_LIST_HEAD(&candidates);
4445                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4446                         if (super_90_load(rdev, rdev0, 0) >= 0) {
4447                                 printk(KERN_INFO "md:  adding %s ...\n",
4448                                         bdevname(rdev->bdev,b));
4449                                 list_move(&rdev->same_set, &candidates);
4450                         }
4451                 /*
4452                  * now we have a set of devices, with all of them having
4453                  * mostly sane superblocks. It's time to allocate the
4454                  * mddev.
4455                  */
4456                 if (part) {
4457                         dev = MKDEV(mdp_major,
4458                                     rdev0->preferred_minor << MdpMinorShift);
4459                         unit = MINOR(dev) >> MdpMinorShift;
4460                 } else {
4461                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4462                         unit = MINOR(dev);
4463                 }
4464                 if (rdev0->preferred_minor != unit) {
4465                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4466                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4467                         break;
4468                 }
4469
4470                 md_probe(dev, NULL, NULL);
4471                 mddev = mddev_find(dev);
4472                 if (!mddev || !mddev->gendisk) {
4473                         if (mddev)
4474                                 mddev_put(mddev);
4475                         printk(KERN_ERR
4476                                 "md: cannot allocate memory for md drive.\n");
4477                         break;
4478                 }
4479                 if (mddev_lock(mddev)) 
4480                         printk(KERN_WARNING "md: %s locked, cannot run\n",
4481                                mdname(mddev));
4482                 else if (mddev->raid_disks || mddev->major_version
4483                          || !list_empty(&mddev->disks)) {
4484                         printk(KERN_WARNING 
4485                                 "md: %s already running, cannot run %s\n",
4486                                 mdname(mddev), bdevname(rdev0->bdev,b));
4487                         mddev_unlock(mddev);
4488                 } else {
4489                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
4490                         mddev->persistent = 1;
4491                         rdev_for_each_list(rdev, tmp, &candidates) {
4492                                 list_del_init(&rdev->same_set);
4493                                 if (bind_rdev_to_array(rdev, mddev))
4494                                         export_rdev(rdev);
4495                         }
4496                         autorun_array(mddev);
4497                         mddev_unlock(mddev);
4498                 }
4499                 /* on success, candidates will be empty, on error
4500                  * it won't...
4501                  */
4502                 rdev_for_each_list(rdev, tmp, &candidates) {
4503                         list_del_init(&rdev->same_set);
4504                         export_rdev(rdev);
4505                 }
4506                 mddev_put(mddev);
4507         }
4508         printk(KERN_INFO "md: ... autorun DONE.\n");
4509 }
4510 #endif /* !MODULE */
4511
4512 static int get_version(void __user * arg)
4513 {
4514         mdu_version_t ver;
4515
4516         ver.major = MD_MAJOR_VERSION;
4517         ver.minor = MD_MINOR_VERSION;
4518         ver.patchlevel = MD_PATCHLEVEL_VERSION;
4519
4520         if (copy_to_user(arg, &ver, sizeof(ver)))
4521                 return -EFAULT;
4522
4523         return 0;
4524 }
4525
4526 static int get_array_info(mddev_t * mddev, void __user * arg)
4527 {
4528         mdu_array_info_t info;
4529         int nr,working,active,failed,spare;
4530         mdk_rdev_t *rdev;
4531
4532         nr=working=active=failed=spare=0;
4533         list_for_each_entry(rdev, &mddev->disks, same_set) {
4534                 nr++;
4535                 if (test_bit(Faulty, &rdev->flags))
4536                         failed++;
4537                 else {
4538                         working++;
4539                         if (test_bit(In_sync, &rdev->flags))
4540                                 active++;       
4541                         else
4542                                 spare++;
4543                 }
4544         }
4545
4546         info.major_version = mddev->major_version;
4547         info.minor_version = mddev->minor_version;
4548         info.patch_version = MD_PATCHLEVEL_VERSION;
4549         info.ctime         = mddev->ctime;
4550         info.level         = mddev->level;
4551         info.size          = mddev->dev_sectors / 2;
4552         if (info.size != mddev->dev_sectors / 2) /* overflow */
4553                 info.size = -1;
4554         info.nr_disks      = nr;
4555         info.raid_disks    = mddev->raid_disks;
4556         info.md_minor      = mddev->md_minor;
4557         info.not_persistent= !mddev->persistent;
4558
4559         info.utime         = mddev->utime;
4560         info.state         = 0;
4561         if (mddev->in_sync)
4562                 info.state = (1<<MD_SB_CLEAN);
4563         if (mddev->bitmap && mddev->bitmap_offset)
4564                 info.state = (1<<MD_SB_BITMAP_PRESENT);
4565         info.active_disks  = active;
4566         info.working_disks = working;
4567         info.failed_disks  = failed;
4568         info.spare_disks   = spare;
4569
4570         info.layout        = mddev->layout;
4571         info.chunk_size    = mddev->chunk_sectors << 9;
4572
4573         if (copy_to_user(arg, &info, sizeof(info)))
4574                 return -EFAULT;
4575
4576         return 0;
4577 }
4578
4579 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4580 {
4581         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4582         char *ptr, *buf = NULL;
4583         int err = -ENOMEM;
4584
4585         if (md_allow_write(mddev))
4586                 file = kmalloc(sizeof(*file), GFP_NOIO);
4587         else
4588                 file = kmalloc(sizeof(*file), GFP_KERNEL);
4589
4590         if (!file)
4591                 goto out;
4592
4593         /* bitmap disabled, zero the first byte and copy out */
4594         if (!mddev->bitmap || !mddev->bitmap->file) {
4595                 file->pathname[0] = '\0';
4596                 goto copy_out;
4597         }
4598
4599         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4600         if (!buf)
4601                 goto out;
4602
4603         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4604         if (IS_ERR(ptr))
4605                 goto out;
4606
4607         strcpy(file->pathname, ptr);
4608
4609 copy_out:
4610         err = 0;
4611         if (copy_to_user(arg, file, sizeof(*file)))
4612                 err = -EFAULT;
4613 out:
4614         kfree(buf);
4615         kfree(file);
4616         return err;
4617 }
4618
4619 static int get_disk_info(mddev_t * mddev, void __user * arg)
4620 {
4621         mdu_disk_info_t info;
4622         mdk_rdev_t *rdev;
4623
4624         if (copy_from_user(&info, arg, sizeof(info)))
4625                 return -EFAULT;
4626
4627         rdev = find_rdev_nr(mddev, info.number);
4628         if (rdev) {
4629                 info.major = MAJOR(rdev->bdev->bd_dev);
4630                 info.minor = MINOR(rdev->bdev->bd_dev);
4631                 info.raid_disk = rdev->raid_disk;
4632                 info.state = 0;
4633                 if (test_bit(Faulty, &rdev->flags))
4634                         info.state |= (1<<MD_DISK_FAULTY);
4635                 else if (test_bit(In_sync, &rdev->flags)) {
4636                         info.state |= (1<<MD_DISK_ACTIVE);
4637                         info.state |= (1<<MD_DISK_SYNC);
4638                 }
4639                 if (test_bit(WriteMostly, &rdev->flags))
4640                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
4641         } else {
4642                 info.major = info.minor = 0;
4643                 info.raid_disk = -1;
4644                 info.state = (1<<MD_DISK_REMOVED);
4645         }
4646
4647         if (copy_to_user(arg, &info, sizeof(info)))
4648                 return -EFAULT;
4649
4650         return 0;
4651 }
4652
4653 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4654 {
4655         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4656         mdk_rdev_t *rdev;
4657         dev_t dev = MKDEV(info->major,info->minor);
4658
4659         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4660                 return -EOVERFLOW;
4661
4662         if (!mddev->raid_disks) {
4663                 int err;
4664                 /* expecting a device which has a superblock */
4665                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4666                 if (IS_ERR(rdev)) {
4667                         printk(KERN_WARNING 
4668                                 "md: md_import_device returned %ld\n",
4669                                 PTR_ERR(rdev));
4670                         return PTR_ERR(rdev);
4671                 }
4672                 if (!list_empty(&mddev->disks)) {
4673                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4674                                                         mdk_rdev_t, same_set);
4675                         int err = super_types[mddev->major_version]
4676                                 .load_super(rdev, rdev0, mddev->minor_version);
4677                         if (err < 0) {
4678                                 printk(KERN_WARNING 
4679                                         "md: %s has different UUID to %s\n",
4680                                         bdevname(rdev->bdev,b), 
4681                                         bdevname(rdev0->bdev,b2));
4682                                 export_rdev(rdev);
4683                                 return -EINVAL;
4684                         }
4685                 }
4686                 err = bind_rdev_to_array(rdev, mddev);
4687                 if (err)
4688                         export_rdev(rdev);
4689                 return err;
4690         }
4691
4692         /*
4693          * add_new_disk can be used once the array is assembled
4694          * to add "hot spares".  They must already have a superblock
4695          * written
4696          */
4697         if (mddev->pers) {
4698                 int err;
4699                 if (!mddev->pers->hot_add_disk) {
4700                         printk(KERN_WARNING 
4701                                 "%s: personality does not support diskops!\n",
4702                                mdname(mddev));
4703                         return -EINVAL;
4704                 }
4705                 if (mddev->persistent)
4706                         rdev = md_import_device(dev, mddev->major_version,
4707                                                 mddev->minor_version);
4708                 else
4709                         rdev = md_import_device(dev, -1, -1);
4710                 if (IS_ERR(rdev)) {
4711                         printk(KERN_WARNING 
4712                                 "md: md_import_device returned %ld\n",
4713                                 PTR_ERR(rdev));
4714                         return PTR_ERR(rdev);
4715                 }
4716                 /* set save_raid_disk if appropriate */
4717                 if (!mddev->persistent) {
4718                         if (info->state & (1<<MD_DISK_SYNC)  &&
4719                             info->raid_disk < mddev->raid_disks)
4720                                 rdev->raid_disk = info->raid_disk;
4721                         else
4722                                 rdev->raid_disk = -1;
4723                 } else
4724                         super_types[mddev->major_version].
4725                                 validate_super(mddev, rdev);
4726                 rdev->saved_raid_disk = rdev->raid_disk;
4727
4728                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4729                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4730                         set_bit(WriteMostly, &rdev->flags);
4731                 else
4732                         clear_bit(WriteMostly, &rdev->flags);
4733
4734                 rdev->raid_disk = -1;
4735                 err = bind_rdev_to_array(rdev, mddev);
4736                 if (!err && !mddev->pers->hot_remove_disk) {
4737                         /* If there is hot_add_disk but no hot_remove_disk
4738                          * then added disks for geometry changes,
4739                          * and should be added immediately.
4740                          */
4741                         super_types[mddev->major_version].
4742                                 validate_super(mddev, rdev);
4743                         err = mddev->pers->hot_add_disk(mddev, rdev);
4744                         if (err)
4745                                 unbind_rdev_from_array(rdev);
4746                 }
4747                 if (err)
4748                         export_rdev(rdev);
4749                 else
4750                         sysfs_notify_dirent(rdev->sysfs_state);
4751
4752                 md_update_sb(mddev, 1);
4753                 if (mddev->degraded)
4754                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4755                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4756                 md_wakeup_thread(mddev->thread);
4757                 return err;
4758         }
4759
4760         /* otherwise, add_new_disk is only allowed
4761          * for major_version==0 superblocks
4762          */
4763         if (mddev->major_version != 0) {
4764                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4765                        mdname(mddev));
4766                 return -EINVAL;
4767         }
4768
4769         if (!(info->state & (1<<MD_DISK_FAULTY))) {
4770                 int err;
4771                 rdev = md_import_device(dev, -1, 0);
4772                 if (IS_ERR(rdev)) {
4773                         printk(KERN_WARNING 
4774                                 "md: error, md_import_device() returned %ld\n",
4775                                 PTR_ERR(rdev));
4776                         return PTR_ERR(rdev);
4777                 }
4778                 rdev->desc_nr = info->number;
4779                 if (info->raid_disk < mddev->raid_disks)
4780                         rdev->raid_disk = info->raid_disk;
4781                 else
4782                         rdev->raid_disk = -1;
4783
4784                 if (rdev->raid_disk < mddev->raid_disks)
4785                         if (info->state & (1<<MD_DISK_SYNC))
4786                                 set_bit(In_sync, &rdev->flags);
4787
4788                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4789                         set_bit(WriteMostly, &rdev->flags);
4790
4791                 if (!mddev->persistent) {
4792                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
4793                         rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4794                 } else 
4795                         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4796                 rdev->sectors = rdev->sb_start;
4797
4798                 err = bind_rdev_to_array(rdev, mddev);
4799                 if (err) {
4800                         export_rdev(rdev);
4801                         return err;
4802                 }
4803         }
4804
4805         return 0;
4806 }
4807
4808 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4809 {
4810         char b[BDEVNAME_SIZE];
4811         mdk_rdev_t *rdev;
4812
4813         rdev = find_rdev(mddev, dev);
4814         if (!rdev)
4815                 return -ENXIO;
4816
4817         if (rdev->raid_disk >= 0)
4818                 goto busy;
4819
4820         kick_rdev_from_array(rdev);
4821         md_update_sb(mddev, 1);
4822         md_new_event(mddev);
4823
4824         return 0;
4825 busy:
4826         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4827                 bdevname(rdev->bdev,b), mdname(mddev));
4828         return -EBUSY;
4829 }
4830
4831 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4832 {
4833         char b[BDEVNAME_SIZE];
4834         int err;
4835         mdk_rdev_t *rdev;
4836
4837         if (!mddev->pers)
4838                 return -ENODEV;
4839
4840         if (mddev->major_version != 0) {
4841                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4842                         " version-0 superblocks.\n",
4843                         mdname(mddev));
4844                 return -EINVAL;
4845         }
4846         if (!mddev->pers->hot_add_disk) {
4847                 printk(KERN_WARNING 
4848                         "%s: personality does not support diskops!\n",
4849                         mdname(mddev));
4850                 return -EINVAL;
4851         }
4852
4853         rdev = md_import_device(dev, -1, 0);
4854         if (IS_ERR(rdev)) {
4855                 printk(KERN_WARNING 
4856                         "md: error, md_import_device() returned %ld\n",
4857                         PTR_ERR(rdev));
4858                 return -EINVAL;
4859         }
4860
4861         if (mddev->persistent)
4862                 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4863         else
4864                 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4865
4866         rdev->sectors = rdev->sb_start;
4867
4868         if (test_bit(Faulty, &rdev->flags)) {
4869                 printk(KERN_WARNING 
4870                         "md: can not hot-add faulty %s disk to %s!\n",
4871                         bdevname(rdev->bdev,b), mdname(mddev));
4872                 err = -EINVAL;
4873                 goto abort_export;
4874         }
4875         clear_bit(In_sync, &rdev->flags);
4876         rdev->desc_nr = -1;
4877         rdev->saved_raid_disk = -1;
4878         err = bind_rdev_to_array(rdev, mddev);
4879         if (err)
4880                 goto abort_export;
4881
4882         /*
4883          * The rest should better be atomic, we can have disk failures
4884          * noticed in interrupt contexts ...
4885          */
4886
4887         rdev->raid_disk = -1;
4888
4889         md_update_sb(mddev, 1);
4890
4891         /*
4892          * Kick recovery, maybe this spare has to be added to the
4893          * array immediately.
4894          */
4895         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4896         md_wakeup_thread(mddev->thread);
4897         md_new_event(mddev);
4898         return 0;
4899
4900 abort_export:
4901         export_rdev(rdev);
4902         return err;
4903 }
4904
4905 static int set_bitmap_file(mddev_t *mddev, int fd)
4906 {
4907         int err;
4908
4909         if (mddev->pers) {
4910                 if (!mddev->pers->quiesce)
4911                         return -EBUSY;
4912                 if (mddev->recovery || mddev->sync_thread)
4913                         return -EBUSY;
4914                 /* we should be able to change the bitmap.. */
4915         }
4916
4917
4918         if (fd >= 0) {
4919                 if (mddev->bitmap)
4920                         return -EEXIST; /* cannot add when bitmap is present */
4921                 mddev->bitmap_file = fget(fd);
4922
4923                 if (mddev->bitmap_file == NULL) {
4924                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4925                                mdname(mddev));
4926                         return -EBADF;
4927                 }
4928
4929                 err = deny_bitmap_write_access(mddev->bitmap_file);
4930                 if (err) {
4931                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4932                                mdname(mddev));
4933                         fput(mddev->bitmap_file);
4934                         mddev->bitmap_file = NULL;
4935                         return err;
4936                 }
4937                 mddev->bitmap_offset = 0; /* file overrides offset */
4938         } else if (mddev->bitmap == NULL)
4939                 return -ENOENT; /* cannot remove what isn't there */
4940         err = 0;
4941         if (mddev->pers) {
4942                 mddev->pers->quiesce(mddev, 1);
4943                 if (fd >= 0)
4944                         err = bitmap_create(mddev);
4945                 if (fd < 0 || err) {
4946                         bitmap_destroy(mddev);
4947                         fd = -1; /* make sure to put the file */
4948                 }
4949                 mddev->pers->quiesce(mddev, 0);
4950         }
4951         if (fd < 0) {
4952                 if (mddev->bitmap_file) {
4953                         restore_bitmap_write_access(mddev->bitmap_file);
4954                         fput(mddev->bitmap_file);
4955                 }
4956                 mddev->bitmap_file = NULL;
4957         }
4958
4959         return err;
4960 }
4961
4962 /*
4963  * set_array_info is used two different ways
4964  * The original usage is when creating a new array.
4965  * In this usage, raid_disks is > 0 and it together with
4966  *  level, size, not_persistent,layout,chunksize determine the
4967  *  shape of the array.
4968  *  This will always create an array with a type-0.90.0 superblock.
4969  * The newer usage is when assembling an array.
4970  *  In this case raid_disks will be 0, and the major_version field is
4971  *  use to determine which style super-blocks are to be found on the devices.
4972  *  The minor and patch _version numbers are also kept incase the
4973  *  super_block handler wishes to interpret them.
4974  */
4975 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4976 {
4977
4978         if (info->raid_disks == 0) {
4979                 /* just setting version number for superblock loading */
4980                 if (info->major_version < 0 ||
4981                     info->major_version >= ARRAY_SIZE(super_types) ||
4982                     super_types[info->major_version].name == NULL) {
4983                         /* maybe try to auto-load a module? */
4984                         printk(KERN_INFO 
4985                                 "md: superblock version %d not known\n",
4986                                 info->major_version);
4987                         return -EINVAL;
4988                 }
4989                 mddev->major_version = info->major_version;
4990                 mddev->minor_version = info->minor_version;
4991                 mddev->patch_version = info->patch_version;
4992                 mddev->persistent = !info->not_persistent;
4993                 return 0;
4994         }
4995         mddev->major_version = MD_MAJOR_VERSION;
4996         mddev->minor_version = MD_MINOR_VERSION;
4997         mddev->patch_version = MD_PATCHLEVEL_VERSION;
4998         mddev->ctime         = get_seconds();
4999
5000         mddev->level         = info->level;
5001         mddev->clevel[0]     = 0;
5002         mddev->dev_sectors   = 2 * (sector_t)info->size;
5003         mddev->raid_disks    = info->raid_disks;
5004         /* don't set md_minor, it is determined by which /dev/md* was
5005          * openned
5006          */
5007         if (info->state & (1<<MD_SB_CLEAN))
5008                 mddev->recovery_cp = MaxSector;
5009         else
5010                 mddev->recovery_cp = 0;
5011         mddev->persistent    = ! info->not_persistent;
5012         mddev->external      = 0;
5013
5014         mddev->layout        = info->layout;
5015         mddev->chunk_sectors = info->chunk_size >> 9;
5016
5017         mddev->max_disks     = MD_SB_DISKS;
5018
5019         if (mddev->persistent)
5020                 mddev->flags         = 0;
5021         set_bit(MD_CHANGE_DEVS, &mddev->flags);
5022
5023         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
5024         mddev->bitmap_offset = 0;
5025
5026         mddev->reshape_position = MaxSector;
5027
5028         /*
5029          * Generate a 128 bit UUID
5030          */
5031         get_random_bytes(mddev->uuid, 16);
5032
5033         mddev->new_level = mddev->level;
5034         mddev->new_chunk_sectors = mddev->chunk_sectors;
5035         mddev->new_layout = mddev->layout;
5036         mddev->delta_disks = 0;
5037
5038         return 0;
5039 }
5040
5041 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5042 {
5043         WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5044
5045         if (mddev->external_size)
5046                 return;
5047
5048         mddev->array_sectors = array_sectors;
5049 }
5050 EXPORT_SYMBOL(md_set_array_sectors);
5051
5052 static int update_size(mddev_t *mddev, sector_t num_sectors)
5053 {
5054         mdk_rdev_t *rdev;
5055         int rv;
5056         int fit = (num_sectors == 0);
5057
5058         if (mddev->pers->resize == NULL)
5059                 return -EINVAL;
5060         /* The "num_sectors" is the number of sectors of each device that
5061          * is used.  This can only make sense for arrays with redundancy.
5062          * linear and raid0 always use whatever space is available. We can only
5063          * consider changing this number if no resync or reconstruction is
5064          * happening, and if the new size is acceptable. It must fit before the
5065          * sb_start or, if that is <data_offset, it must fit before the size
5066          * of each device.  If num_sectors is zero, we find the largest size
5067          * that fits.
5068
5069          */
5070         if (mddev->sync_thread)
5071                 return -EBUSY;
5072         if (mddev->bitmap)
5073                 /* Sorry, cannot grow a bitmap yet, just remove it,
5074                  * grow, and re-add.
5075                  */
5076                 return -EBUSY;
5077         list_for_each_entry(rdev, &mddev->disks, same_set) {
5078                 sector_t avail = rdev->sectors;
5079
5080                 if (fit && (num_sectors == 0 || num_sectors > avail))
5081                         num_sectors = avail;
5082                 if (avail < num_sectors)
5083                         return -ENOSPC;
5084         }
5085         rv = mddev->pers->resize(mddev, num_sectors);
5086         if (!rv) {
5087                 struct block_device *bdev;
5088
5089                 bdev = bdget_disk(mddev->gendisk, 0);
5090                 if (bdev) {
5091                         mutex_lock(&bdev->bd_inode->i_mutex);
5092                         i_size_write(bdev->bd_inode,
5093                                      (loff_t)mddev->array_sectors << 9);
5094                         mutex_unlock(&bdev->bd_inode->i_mutex);
5095                         bdput(bdev);
5096                 }
5097         }
5098         return rv;
5099 }
5100
5101 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5102 {
5103         int rv;
5104         /* change the number of raid disks */
5105         if (mddev->pers->check_reshape == NULL)
5106                 return -EINVAL;
5107         if (raid_disks <= 0 ||
5108             raid_disks >= mddev->max_disks)
5109                 return -EINVAL;
5110         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5111                 return -EBUSY;
5112         mddev->delta_disks = raid_disks - mddev->raid_disks;
5113
5114         rv = mddev->pers->check_reshape(mddev);
5115         return rv;
5116 }
5117
5118
5119 /*
5120  * update_array_info is used to change the configuration of an
5121  * on-line array.
5122  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
5123  * fields in the info are checked against the array.
5124  * Any differences that cannot be handled will cause an error.
5125  * Normally, only one change can be managed at a time.
5126  */
5127 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5128 {
5129         int rv = 0;
5130         int cnt = 0;
5131         int state = 0;
5132
5133         /* calculate expected state,ignoring low bits */
5134         if (mddev->bitmap && mddev->bitmap_offset)
5135                 state |= (1 << MD_SB_BITMAP_PRESENT);
5136
5137         if (mddev->major_version != info->major_version ||
5138             mddev->minor_version != info->minor_version ||
5139 /*          mddev->patch_version != info->patch_version || */
5140             mddev->ctime         != info->ctime         ||
5141             mddev->level         != info->level         ||
5142 /*          mddev->layout        != info->layout        || */
5143             !mddev->persistent   != info->not_persistent||
5144             mddev->chunk_sectors != info->chunk_size >> 9 ||
5145             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5146             ((state^info->state) & 0xfffffe00)
5147                 )
5148                 return -EINVAL;
5149         /* Check there is only one change */
5150         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5151                 cnt++;
5152         if (mddev->raid_disks != info->raid_disks)
5153                 cnt++;
5154         if (mddev->layout != info->layout)
5155                 cnt++;
5156         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5157                 cnt++;
5158         if (cnt == 0)
5159                 return 0;
5160         if (cnt > 1)
5161                 return -EINVAL;
5162
5163         if (mddev->layout != info->layout) {
5164                 /* Change layout
5165                  * we don't need to do anything at the md level, the
5166                  * personality will take care of it all.
5167                  */
5168                 if (mddev->pers->check_reshape == NULL)
5169                         return -EINVAL;
5170                 else {
5171                         mddev->new_layout = info->layout;
5172                         rv = mddev->pers->check_reshape(mddev);
5173                         if (rv)
5174                                 mddev->new_layout = mddev->layout;
5175                         return rv;
5176                 }
5177         }
5178         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5179                 rv = update_size(mddev, (sector_t)info->size * 2);
5180
5181         if (mddev->raid_disks    != info->raid_disks)
5182                 rv = update_raid_disks(mddev, info->raid_disks);
5183
5184         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5185                 if (mddev->pers->quiesce == NULL)
5186                         return -EINVAL;
5187                 if (mddev->recovery || mddev->sync_thread)
5188                         return -EBUSY;
5189                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5190                         /* add the bitmap */
5191                         if (mddev->bitmap)
5192                                 return -EEXIST;
5193                         if (mddev->default_bitmap_offset == 0)
5194                                 return -EINVAL;
5195                         mddev->bitmap_offset = mddev->default_bitmap_offset;
5196                         mddev->pers->quiesce(mddev, 1);
5197                         rv = bitmap_create(mddev);
5198                         if (rv)
5199                                 bitmap_destroy(mddev);
5200                         mddev->pers->quiesce(mddev, 0);
5201                 } else {
5202                         /* remove the bitmap */
5203                         if (!mddev->bitmap)
5204                                 return -ENOENT;
5205                         if (mddev->bitmap->file)
5206                                 return -EINVAL;
5207                         mddev->pers->quiesce(mddev, 1);
5208                         bitmap_destroy(mddev);
5209                         mddev->pers->quiesce(mddev, 0);
5210                         mddev->bitmap_offset = 0;
5211                 }
5212         }
5213         md_update_sb(mddev, 1);
5214         return rv;
5215 }
5216
5217 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5218 {
5219         mdk_rdev_t *rdev;
5220
5221         if (mddev->pers == NULL)
5222                 return -ENODEV;
5223
5224         rdev = find_rdev(mddev, dev);
5225         if (!rdev)
5226                 return -ENODEV;
5227
5228         md_error(mddev, rdev);
5229         return 0;
5230 }
5231
5232 /*
5233  * We have a problem here : there is no easy way to give a CHS
5234  * virtual geometry. We currently pretend that we have a 2 heads
5235  * 4 sectors (with a BIG number of cylinders...). This drives
5236  * dosfs just mad... ;-)
5237  */
5238 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5239 {
5240         mddev_t *mddev = bdev->bd_disk->private_data;
5241
5242         geo->heads = 2;
5243         geo->sectors = 4;
5244         geo->cylinders = get_capacity(mddev->gendisk) / 8;
5245         return 0;
5246 }
5247
5248 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5249                         unsigned int cmd, unsigned long arg)
5250 {
5251         int err = 0;
5252         void __user *argp = (void __user *)arg;
5253         mddev_t *mddev = NULL;
5254
5255         if (!capable(CAP_SYS_ADMIN))
5256                 return -EACCES;
5257
5258         /*
5259          * Commands dealing with the RAID driver but not any
5260          * particular array:
5261          */
5262         switch (cmd)
5263         {
5264                 case RAID_VERSION:
5265                         err = get_version(argp);
5266                         goto done;
5267
5268                 case PRINT_RAID_DEBUG:
5269                         err = 0;
5270                         md_print_devices();
5271                         goto done;
5272
5273 #ifndef MODULE
5274                 case RAID_AUTORUN:
5275                         err = 0;
5276                         autostart_arrays(arg);
5277                         goto done;
5278 #endif
5279                 default:;
5280         }
5281
5282         /*
5283          * Commands creating/starting a new array:
5284          */
5285
5286         mddev = bdev->bd_disk->private_data;
5287
5288         if (!mddev) {
5289                 BUG();
5290                 goto abort;
5291         }
5292
5293         err = mddev_lock(mddev);
5294         if (err) {
5295                 printk(KERN_INFO 
5296                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
5297                         err, cmd);
5298                 goto abort;
5299         }
5300
5301         switch (cmd)
5302         {
5303                 case SET_ARRAY_INFO:
5304                         {
5305                                 mdu_array_info_t info;
5306                                 if (!arg)
5307                                         memset(&info, 0, sizeof(info));
5308                                 else if (copy_from_user(&info, argp, sizeof(info))) {
5309                                         err = -EFAULT;
5310                                         goto abort_unlock;
5311                                 }
5312                                 if (mddev->pers) {
5313                                         err = update_array_info(mddev, &info);
5314                                         if (err) {
5315                                                 printk(KERN_WARNING "md: couldn't update"
5316                                                        " array info. %d\n", err);
5317                                                 goto abort_unlock;
5318                                         }
5319                                         goto done_unlock;
5320                                 }
5321                                 if (!list_empty(&mddev->disks)) {
5322                                         printk(KERN_WARNING
5323                                                "md: array %s already has disks!\n",
5324                                                mdname(mddev));
5325                                         err = -EBUSY;
5326                                         goto abort_unlock;
5327                                 }
5328                                 if (mddev->raid_disks) {
5329                                         printk(KERN_WARNING
5330                                                "md: array %s already initialised!\n",
5331                                                mdname(mddev));
5332                                         err = -EBUSY;
5333                                         goto abort_unlock;
5334                                 }
5335                                 err = set_array_info(mddev, &info);
5336                                 if (err) {
5337                                         printk(KERN_WARNING "md: couldn't set"
5338                                                " array info. %d\n", err);
5339                                         goto abort_unlock;
5340                                 }
5341                         }
5342                         goto done_unlock;
5343
5344                 default:;
5345         }
5346
5347         /*
5348          * Commands querying/configuring an existing array:
5349          */
5350         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5351          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5352         if ((!mddev->raid_disks && !mddev->external)
5353             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5354             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5355             && cmd != GET_BITMAP_FILE) {
5356                 err = -ENODEV;
5357                 goto abort_unlock;
5358         }
5359
5360         /*
5361          * Commands even a read-only array can execute:
5362          */
5363         switch (cmd)
5364         {
5365                 case GET_ARRAY_INFO:
5366                         err = get_array_info(mddev, argp);
5367                         goto done_unlock;
5368
5369                 case GET_BITMAP_FILE:
5370                         err = get_bitmap_file(mddev, argp);
5371                         goto done_unlock;
5372
5373                 case GET_DISK_INFO:
5374                         err = get_disk_info(mddev, argp);
5375                         goto done_unlock;
5376
5377                 case RESTART_ARRAY_RW:
5378                         err = restart_array(mddev);
5379                         goto done_unlock;
5380
5381                 case STOP_ARRAY:
5382                         err = do_md_stop(mddev, 0, 1);
5383                         goto done_unlock;
5384
5385                 case STOP_ARRAY_RO:
5386                         err = do_md_stop(mddev, 1, 1);
5387                         goto done_unlock;
5388
5389         }
5390
5391         /*
5392          * The remaining ioctls are changing the state of the
5393          * superblock, so we do not allow them on read-only arrays.
5394          * However non-MD ioctls (e.g. get-size) will still come through
5395          * here and hit the 'default' below, so only disallow
5396          * 'md' ioctls, and switch to rw mode if started auto-readonly.
5397          */
5398         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5399                 if (mddev->ro == 2) {
5400                         mddev->ro = 0;
5401                         sysfs_notify_dirent(mddev->sysfs_state);
5402                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5403                         md_wakeup_thread(mddev->thread);
5404                 } else {
5405                         err = -EROFS;
5406                         goto abort_unlock;
5407                 }
5408         }
5409
5410         switch (cmd)
5411         {
5412                 case ADD_NEW_DISK:
5413                 {
5414                         mdu_disk_info_t info;
5415                         if (copy_from_user(&info, argp, sizeof(info)))
5416                                 err = -EFAULT;
5417                         else
5418                                 err = add_new_disk(mddev, &info);
5419                         goto done_unlock;
5420                 }
5421
5422                 case HOT_REMOVE_DISK:
5423                         err = hot_remove_disk(mddev, new_decode_dev(arg));
5424                         goto done_unlock;
5425
5426                 case HOT_ADD_DISK:
5427                         err = hot_add_disk(mddev, new_decode_dev(arg));
5428                         goto done_unlock;
5429
5430                 case SET_DISK_FAULTY:
5431                         err = set_disk_faulty(mddev, new_decode_dev(arg));
5432                         goto done_unlock;
5433
5434                 case RUN_ARRAY:
5435                         err = do_md_run(mddev);
5436                         goto done_unlock;
5437
5438                 case SET_BITMAP_FILE:
5439                         err = set_bitmap_file(mddev, (int)arg);
5440                         goto done_unlock;
5441
5442                 default:
5443                         err = -EINVAL;
5444                         goto abort_unlock;
5445         }
5446
5447 done_unlock:
5448 abort_unlock:
5449         if (mddev->hold_active == UNTIL_IOCTL &&
5450             err != -EINVAL)
5451                 mddev->hold_active = 0;
5452         mddev_unlock(mddev);
5453
5454         return err;
5455 done:
5456         if (err)
5457                 MD_BUG();
5458 abort:
5459         return err;
5460 }
5461
5462 static int md_open(struct block_device *bdev, fmode_t mode)
5463 {
5464         /*
5465          * Succeed if we can lock the mddev, which confirms that
5466          * it isn't being stopped right now.
5467          */
5468         mddev_t *mddev = mddev_find(bdev->bd_dev);
5469         int err;
5470
5471         if (mddev->gendisk != bdev->bd_disk) {
5472                 /* we are racing with mddev_put which is discarding this
5473                  * bd_disk.
5474                  */
5475                 mddev_put(mddev);
5476                 /* Wait until bdev->bd_disk is definitely gone */
5477                 flush_scheduled_work();
5478                 /* Then retry the open from the top */
5479                 return -ERESTARTSYS;
5480         }
5481         BUG_ON(mddev != bdev->bd_disk->private_data);
5482
5483         if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5484                 goto out;
5485
5486         err = 0;
5487         atomic_inc(&mddev->openers);
5488         mddev_unlock(mddev);
5489
5490         check_disk_change(bdev);
5491  out:
5492         return err;
5493 }
5494
5495 static int md_release(struct gendisk *disk, fmode_t mode)
5496 {
5497         mddev_t *mddev = disk->private_data;
5498
5499         BUG_ON(!mddev);
5500         atomic_dec(&mddev->openers);
5501         mddev_put(mddev);
5502
5503         return 0;
5504 }
5505
5506 static int md_media_changed(struct gendisk *disk)
5507 {
5508         mddev_t *mddev = disk->private_data;
5509
5510         return mddev->changed;
5511 }
5512
5513 static int md_revalidate(struct gendisk *disk)
5514 {
5515         mddev_t *mddev = disk->private_data;
5516
5517         mddev->changed = 0;
5518         return 0;
5519 }
5520 static struct block_device_operations md_fops =
5521 {
5522         .owner          = THIS_MODULE,
5523         .open           = md_open,
5524         .release        = md_release,
5525         .ioctl          = md_ioctl,
5526         .getgeo         = md_getgeo,
5527         .media_changed  = md_media_changed,
5528         .revalidate_disk= md_revalidate,
5529 };
5530
5531 static int md_thread(void * arg)
5532 {
5533         mdk_thread_t *thread = arg;
5534
5535         /*
5536          * md_thread is a 'system-thread', it's priority should be very
5537          * high. We avoid resource deadlocks individually in each
5538          * raid personality. (RAID5 does preallocation) We also use RR and
5539          * the very same RT priority as kswapd, thus we will never get
5540          * into a priority inversion deadlock.
5541          *
5542          * we definitely have to have equal or higher priority than
5543          * bdflush, otherwise bdflush will deadlock if there are too
5544          * many dirty RAID5 blocks.
5545          */
5546
5547         allow_signal(SIGKILL);
5548         while (!kthread_should_stop()) {
5549
5550                 /* We need to wait INTERRUPTIBLE so that
5551                  * we don't add to the load-average.
5552                  * That means we need to be sure no signals are
5553                  * pending
5554                  */
5555                 if (signal_pending(current))
5556                         flush_signals(current);
5557
5558                 wait_event_interruptible_timeout
5559                         (thread->wqueue,
5560                          test_bit(THREAD_WAKEUP, &thread->flags)
5561                          || kthread_should_stop(),
5562                          thread->timeout);
5563
5564                 clear_bit(THREAD_WAKEUP, &thread->flags);
5565
5566                 thread->run(thread->mddev);
5567         }
5568
5569         return 0;
5570 }
5571
5572 void md_wakeup_thread(mdk_thread_t *thread)
5573 {
5574         if (thread) {
5575                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5576                 set_bit(THREAD_WAKEUP, &thread->flags);
5577                 wake_up(&thread->wqueue);
5578         }
5579 }
5580
5581 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5582                                  const char *name)
5583 {
5584         mdk_thread_t *thread;
5585
5586         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5587         if (!thread)
5588                 return NULL;
5589
5590         init_waitqueue_head(&thread->wqueue);
5591
5592         thread->run = run;
5593         thread->mddev = mddev;
5594         thread->timeout = MAX_SCHEDULE_TIMEOUT;
5595         thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5596         if (IS_ERR(thread->tsk)) {
5597                 kfree(thread);
5598                 return NULL;
5599         }
5600         return thread;
5601 }
5602
5603 void md_unregister_thread(mdk_thread_t *thread)
5604 {
5605         if (!thread)
5606                 return;
5607         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5608
5609         kthread_stop(thread->tsk);
5610         kfree(thread);
5611 }
5612
5613 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5614 {
5615         if (!mddev) {
5616                 MD_BUG();
5617                 return;
5618         }
5619
5620         if (!rdev || test_bit(Faulty, &rdev->flags))
5621                 return;
5622
5623         if (mddev->external)
5624                 set_bit(Blocked, &rdev->flags);
5625 /*
5626         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5627                 mdname(mddev),
5628                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5629                 __builtin_return_address(0),__builtin_return_address(1),
5630                 __builtin_return_address(2),__builtin_return_address(3));
5631 */
5632         if (!mddev->pers)
5633                 return;
5634         if (!mddev->pers->error_handler)
5635                 return;
5636         mddev->pers->error_handler(mddev,rdev);
5637         if (mddev->degraded)
5638                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5639         set_bit(StateChanged, &rdev->flags);
5640         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5641         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5642         md_wakeup_thread(mddev->thread);
5643         md_new_event_inintr(mddev);
5644 }
5645
5646 /* seq_file implementation /proc/mdstat */
5647
5648 static void status_unused(struct seq_file *seq)
5649 {
5650         int i = 0;
5651         mdk_rdev_t *rdev;
5652
5653         seq_printf(seq, "unused devices: ");
5654
5655         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5656                 char b[BDEVNAME_SIZE];
5657                 i++;
5658                 seq_printf(seq, "%s ",
5659                               bdevname(rdev->bdev,b));
5660         }
5661         if (!i)
5662                 seq_printf(seq, "<none>");
5663
5664         seq_printf(seq, "\n");
5665 }
5666
5667
5668 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5669 {
5670         sector_t max_sectors, resync, res;
5671         unsigned long dt, db;
5672         sector_t rt;
5673         int scale;
5674         unsigned int per_milli;
5675
5676         resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5677
5678         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5679                 max_sectors = mddev->resync_max_sectors;
5680         else
5681                 max_sectors = mddev->dev_sectors;
5682
5683         /*
5684          * Should not happen.
5685          */
5686         if (!max_sectors) {
5687                 MD_BUG();
5688                 return;
5689         }
5690         /* Pick 'scale' such that (resync>>scale)*1000 will fit
5691          * in a sector_t, and (max_sectors>>scale) will fit in a
5692          * u32, as those are the requirements for sector_div.
5693          * Thus 'scale' must be at least 10
5694          */
5695         scale = 10;
5696         if (sizeof(sector_t) > sizeof(unsigned long)) {
5697                 while ( max_sectors/2 > (1ULL<<(scale+32)))
5698                         scale++;
5699         }
5700         res = (resync>>scale)*1000;
5701         sector_div(res, (u32)((max_sectors>>scale)+1));
5702
5703         per_milli = res;
5704         {
5705                 int i, x = per_milli/50, y = 20-x;
5706                 seq_printf(seq, "[");
5707                 for (i = 0; i < x; i++)
5708                         seq_printf(seq, "=");
5709                 seq_printf(seq, ">");
5710                 for (i = 0; i < y; i++)
5711                         seq_printf(seq, ".");
5712                 seq_printf(seq, "] ");
5713         }
5714         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5715                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5716                     "reshape" :
5717                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5718                      "check" :
5719                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5720                       "resync" : "recovery"))),
5721                    per_milli/10, per_milli % 10,
5722                    (unsigned long long) resync/2,
5723                    (unsigned long long) max_sectors/2);
5724
5725         /*
5726          * dt: time from mark until now
5727          * db: blocks written from mark until now
5728          * rt: remaining time
5729          *
5730          * rt is a sector_t, so could be 32bit or 64bit.
5731          * So we divide before multiply in case it is 32bit and close
5732          * to the limit.
5733          * We scale the divisor (db) by 32 to avoid loosing precision
5734          * near the end of resync when the number of remaining sectors
5735          * is close to 'db'.
5736          * We then divide rt by 32 after multiplying by db to compensate.
5737          * The '+1' avoids division by zero if db is very small.
5738          */
5739         dt = ((jiffies - mddev->resync_mark) / HZ);
5740         if (!dt) dt++;
5741         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5742                 - mddev->resync_mark_cnt;
5743
5744         rt = max_sectors - resync;    /* number of remaining sectors */
5745         sector_div(rt, db/32+1);
5746         rt *= dt;
5747         rt >>= 5;
5748
5749         seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
5750                    ((unsigned long)rt % 60)/6);
5751
5752         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5753 }
5754
5755 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5756 {
5757         struct list_head *tmp;
5758         loff_t l = *pos;
5759         mddev_t *mddev;
5760
5761         if (l >= 0x10000)
5762                 return NULL;
5763         if (!l--)
5764                 /* header */
5765                 return (void*)1;
5766
5767         spin_lock(&all_mddevs_lock);
5768         list_for_each(tmp,&all_mddevs)
5769                 if (!l--) {
5770                         mddev = list_entry(tmp, mddev_t, all_mddevs);
5771                         mddev_get(mddev);
5772                         spin_unlock(&all_mddevs_lock);
5773                         return mddev;
5774                 }
5775         spin_unlock(&all_mddevs_lock);
5776         if (!l--)
5777                 return (void*)2;/* tail */
5778         return NULL;
5779 }
5780
5781 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5782 {
5783         struct list_head *tmp;
5784         mddev_t *next_mddev, *mddev = v;
5785         
5786         ++*pos;
5787         if (v == (void*)2)
5788                 return NULL;
5789
5790         spin_lock(&all_mddevs_lock);
5791         if (v == (void*)1)
5792                 tmp = all_mddevs.next;
5793         else
5794                 tmp = mddev->all_mddevs.next;
5795         if (tmp != &all_mddevs)
5796                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5797         else {
5798                 next_mddev = (void*)2;
5799                 *pos = 0x10000;
5800         }               
5801         spin_unlock(&all_mddevs_lock);
5802
5803         if (v != (void*)1)
5804                 mddev_put(mddev);
5805         return next_mddev;
5806
5807 }
5808
5809 static void md_seq_stop(struct seq_file *seq, void *v)
5810 {
5811         mddev_t *mddev = v;
5812
5813         if (mddev && v != (void*)1 && v != (void*)2)
5814                 mddev_put(mddev);
5815 }
5816
/*
 * State attached to a /proc/mdstat seq_file through seq->private.
 */
struct mdstat_info {
	/* md_event_count as read when the header line is shown */
	int event;
};
5820
5821 static int md_seq_show(struct seq_file *seq, void *v)
5822 {
5823         mddev_t *mddev = v;
5824         sector_t sectors;
5825         mdk_rdev_t *rdev;
5826         struct mdstat_info *mi = seq->private;
5827         struct bitmap *bitmap;
5828
5829         if (v == (void*)1) {
5830                 struct mdk_personality *pers;
5831                 seq_printf(seq, "Personalities : ");
5832                 spin_lock(&pers_lock);
5833                 list_for_each_entry(pers, &pers_list, list)
5834                         seq_printf(seq, "[%s] ", pers->name);
5835
5836                 spin_unlock(&pers_lock);
5837                 seq_printf(seq, "\n");
5838                 mi->event = atomic_read(&md_event_count);
5839                 return 0;
5840         }
5841         if (v == (void*)2) {
5842                 status_unused(seq);
5843                 return 0;
5844         }
5845
5846         if (mddev_lock(mddev) < 0)
5847                 return -EINTR;
5848
5849         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5850                 seq_printf(seq, "%s : %sactive", mdname(mddev),
5851                                                 mddev->pers ? "" : "in");
5852                 if (mddev->pers) {
5853                         if (mddev->ro==1)
5854                                 seq_printf(seq, " (read-only)");
5855                         if (mddev->ro==2)
5856                                 seq_printf(seq, " (auto-read-only)");
5857                         seq_printf(seq, " %s", mddev->pers->name);
5858                 }
5859
5860                 sectors = 0;
5861                 list_for_each_entry(rdev, &mddev->disks, same_set) {
5862                         char b[BDEVNAME_SIZE];
5863                         seq_printf(seq, " %s[%d]",
5864                                 bdevname(rdev->bdev,b), rdev->desc_nr);
5865                         if (test_bit(WriteMostly, &rdev->flags))
5866                                 seq_printf(seq, "(W)");
5867                         if (test_bit(Faulty, &rdev->flags)) {
5868                                 seq_printf(seq, "(F)");
5869                                 continue;
5870                         } else if (rdev->raid_disk < 0)
5871                                 seq_printf(seq, "(S)"); /* spare */
5872                         sectors += rdev->sectors;
5873                 }
5874
5875                 if (!list_empty(&mddev->disks)) {
5876                         if (mddev->pers)
5877                                 seq_printf(seq, "\n      %llu blocks",
5878                                            (unsigned long long)
5879                                            mddev->array_sectors / 2);
5880                         else
5881                                 seq_printf(seq, "\n      %llu blocks",
5882                                            (unsigned long long)sectors / 2);
5883                 }
5884                 if (mddev->persistent) {
5885                         if (mddev->major_version != 0 ||
5886                             mddev->minor_version != 90) {
5887                                 seq_printf(seq," super %d.%d",
5888                                            mddev->major_version,
5889                                            mddev->minor_version);
5890                         }
5891                 } else if (mddev->external)
5892                         seq_printf(seq, " super external:%s",
5893                                    mddev->metadata_type);
5894                 else
5895                         seq_printf(seq, " super non-persistent");
5896
5897                 if (mddev->pers) {