Merge git://git.kernel.org/pub/scm/linux/kernel/git/sam/kbuild
[pandora-kernel.git] / drivers / md / md.c
1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/module.h>
36 #include <linux/kthread.h>
37 #include <linux/linkage.h>
38 #include <linux/raid/md.h>
39 #include <linux/raid/bitmap.h>
40 #include <linux/sysctl.h>
41 #include <linux/buffer_head.h> /* for invalidate_bdev */
42 #include <linux/suspend.h>
43 #include <linux/poll.h>
44 #include <linux/mutex.h>
45 #include <linux/ctype.h>
46
47 #include <linux/init.h>
48
49 #include <linux/file.h>
50
51 #ifdef CONFIG_KMOD
52 #include <linux/kmod.h>
53 #endif
54
55 #include <asm/unaligned.h>
56
57 #define MAJOR_NR MD_MAJOR
58 #define MD_DRIVER
59
60 /* 63 partitions with the alternate major number (mdp) */
61 #define MdpMinorShift 6
62
63 #define DEBUG 0
64 #define dprintk(x...) ((void)(DEBUG && printk(x)))
65
66
67 #ifndef MODULE
68 static void autostart_arrays (int part);
69 #endif
70
71 static LIST_HEAD(pers_list);
72 static DEFINE_SPINLOCK(pers_lock);
73
74 static void md_print_devices(void);
75
76 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
77
78 /*
79  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
80  * is 1000 KB/sec, so the extra system load does not show up that much.
81  * Increase it if you want to have more _guaranteed_ speed. Note that
82  * the RAID driver will use the maximum available bandwidth if the IO
83  * subsystem is idle. There is also an 'absolute maximum' reconstruction
84  * speed limit - in case reconstruction slows down your system despite
85  * idle IO detection.
86  *
87  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
88  * or /sys/block/mdX/md/sync_speed_{min,max}
89  */
90
91 static int sysctl_speed_limit_min = 1000;
92 static int sysctl_speed_limit_max = 200000;
93 static inline int speed_min(mddev_t *mddev)
94 {
95         return mddev->sync_speed_min ?
96                 mddev->sync_speed_min : sysctl_speed_limit_min;
97 }
98
99 static inline int speed_max(mddev_t *mddev)
100 {
101         return mddev->sync_speed_max ?
102                 mddev->sync_speed_max : sysctl_speed_limit_max;
103 }
104
105 static struct ctl_table_header *raid_table_header;
106
107 static ctl_table raid_table[] = {
108         {
109                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
110                 .procname       = "speed_limit_min",
111                 .data           = &sysctl_speed_limit_min,
112                 .maxlen         = sizeof(int),
113                 .mode           = 0644,
114                 .proc_handler   = &proc_dointvec,
115         },
116         {
117                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
118                 .procname       = "speed_limit_max",
119                 .data           = &sysctl_speed_limit_max,
120                 .maxlen         = sizeof(int),
121                 .mode           = 0644,
122                 .proc_handler   = &proc_dointvec,
123         },
124         { .ctl_name = 0 }
125 };
126
127 static ctl_table raid_dir_table[] = {
128         {
129                 .ctl_name       = DEV_RAID,
130                 .procname       = "raid",
131                 .maxlen         = 0,
132                 .mode           = 0555,
133                 .child          = raid_table,
134         },
135         { .ctl_name = 0 }
136 };
137
138 static ctl_table raid_root_table[] = {
139         {
140                 .ctl_name       = CTL_DEV,
141                 .procname       = "dev",
142                 .maxlen         = 0,
143                 .mode           = 0555,
144                 .child          = raid_dir_table,
145         },
146         { .ctl_name = 0 }
147 };
148
149 static struct block_device_operations md_fops;
150
151 static int start_readonly;
152
153 /*
154  * We have a system wide 'event count' that is incremented
155  * on any 'interesting' event, and readers of /proc/mdstat
156  * can use 'poll' or 'select' to find out when the event
157  * count increases.
158  *
159  * Events are:
160  *  start array, stop array, error, add device, remove device,
161  *  start build, activate spare
162  */
163 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
164 static atomic_t md_event_count;
165 void md_new_event(mddev_t *mddev)
166 {
167         atomic_inc(&md_event_count);
168         wake_up(&md_event_waiters);
169         sysfs_notify(&mddev->kobj, NULL, "sync_action");
170 }
171 EXPORT_SYMBOL_GPL(md_new_event);
172
173 /* Alternate version that can be called from interrupts
174  * when calling sysfs_notify isn't needed.
175  */
176 static void md_new_event_inintr(mddev_t *mddev)
177 {
178         atomic_inc(&md_event_count);
179         wake_up(&md_event_waiters);
180 }
181
182 /*
183  * Enables to iterate over all existing md arrays
184  * all_mddevs_lock protects this list.
185  */
186 static LIST_HEAD(all_mddevs);
187 static DEFINE_SPINLOCK(all_mddevs_lock);
188
189
190 /*
191  * iterates through all used mddevs in the system.
192  * We take care to grab the all_mddevs_lock whenever navigating
193  * the list, and to always hold a refcount when unlocked.
194  * Any code which breaks out of this loop while own
195  * a reference to the current mddev and must mddev_put it.
196  */
197 #define ITERATE_MDDEV(mddev,tmp)                                        \
198                                                                         \
199         for (({ spin_lock(&all_mddevs_lock);                            \
200                 tmp = all_mddevs.next;                                  \
201                 mddev = NULL;});                                        \
202              ({ if (tmp != &all_mddevs)                                 \
203                         mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
204                 spin_unlock(&all_mddevs_lock);                          \
205                 if (mddev) mddev_put(mddev);                            \
206                 mddev = list_entry(tmp, mddev_t, all_mddevs);           \
207                 tmp != &all_mddevs;});                                  \
208              ({ spin_lock(&all_mddevs_lock);                            \
209                 tmp = tmp->next;})                                      \
210                 )
211
212
213 static int md_fail_request (request_queue_t *q, struct bio *bio)
214 {
215         bio_io_error(bio, bio->bi_size);
216         return 0;
217 }
218
219 static inline mddev_t *mddev_get(mddev_t *mddev)
220 {
221         atomic_inc(&mddev->active);
222         return mddev;
223 }
224
225 static void mddev_put(mddev_t *mddev)
226 {
227         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
228                 return;
229         if (!mddev->raid_disks && list_empty(&mddev->disks)) {
230                 list_del(&mddev->all_mddevs);
231                 spin_unlock(&all_mddevs_lock);
232                 blk_cleanup_queue(mddev->queue);
233                 kobject_unregister(&mddev->kobj);
234         } else
235                 spin_unlock(&all_mddevs_lock);
236 }
237
238 static mddev_t * mddev_find(dev_t unit)
239 {
240         mddev_t *mddev, *new = NULL;
241
242  retry:
243         spin_lock(&all_mddevs_lock);
244         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
245                 if (mddev->unit == unit) {
246                         mddev_get(mddev);
247                         spin_unlock(&all_mddevs_lock);
248                         kfree(new);
249                         return mddev;
250                 }
251
252         if (new) {
253                 list_add(&new->all_mddevs, &all_mddevs);
254                 spin_unlock(&all_mddevs_lock);
255                 return new;
256         }
257         spin_unlock(&all_mddevs_lock);
258
259         new = kzalloc(sizeof(*new), GFP_KERNEL);
260         if (!new)
261                 return NULL;
262
263         new->unit = unit;
264         if (MAJOR(unit) == MD_MAJOR)
265                 new->md_minor = MINOR(unit);
266         else
267                 new->md_minor = MINOR(unit) >> MdpMinorShift;
268
269         mutex_init(&new->reconfig_mutex);
270         INIT_LIST_HEAD(&new->disks);
271         INIT_LIST_HEAD(&new->all_mddevs);
272         init_timer(&new->safemode_timer);
273         atomic_set(&new->active, 1);
274         spin_lock_init(&new->write_lock);
275         init_waitqueue_head(&new->sb_wait);
276
277         new->queue = blk_alloc_queue(GFP_KERNEL);
278         if (!new->queue) {
279                 kfree(new);
280                 return NULL;
281         }
282         set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
283
284         blk_queue_make_request(new->queue, md_fail_request);
285
286         goto retry;
287 }
288
289 static inline int mddev_lock(mddev_t * mddev)
290 {
291         return mutex_lock_interruptible(&mddev->reconfig_mutex);
292 }
293
294 static inline int mddev_trylock(mddev_t * mddev)
295 {
296         return mutex_trylock(&mddev->reconfig_mutex);
297 }
298
299 static inline void mddev_unlock(mddev_t * mddev)
300 {
301         mutex_unlock(&mddev->reconfig_mutex);
302
303         md_wakeup_thread(mddev->thread);
304 }
305
306 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
307 {
308         mdk_rdev_t * rdev;
309         struct list_head *tmp;
310
311         ITERATE_RDEV(mddev,rdev,tmp) {
312                 if (rdev->desc_nr == nr)
313                         return rdev;
314         }
315         return NULL;
316 }
317
318 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
319 {
320         struct list_head *tmp;
321         mdk_rdev_t *rdev;
322
323         ITERATE_RDEV(mddev,rdev,tmp) {
324                 if (rdev->bdev->bd_dev == dev)
325                         return rdev;
326         }
327         return NULL;
328 }
329
330 static struct mdk_personality *find_pers(int level, char *clevel)
331 {
332         struct mdk_personality *pers;
333         list_for_each_entry(pers, &pers_list, list) {
334                 if (level != LEVEL_NONE && pers->level == level)
335                         return pers;
336                 if (strcmp(pers->name, clevel)==0)
337                         return pers;
338         }
339         return NULL;
340 }
341
342 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
343 {
344         sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
345         return MD_NEW_SIZE_BLOCKS(size);
346 }
347
348 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
349 {
350         sector_t size;
351
352         size = rdev->sb_offset;
353
354         if (chunk_size)
355                 size &= ~((sector_t)chunk_size/1024 - 1);
356         return size;
357 }
358
359 static int alloc_disk_sb(mdk_rdev_t * rdev)
360 {
361         if (rdev->sb_page)
362                 MD_BUG();
363
364         rdev->sb_page = alloc_page(GFP_KERNEL);
365         if (!rdev->sb_page) {
366                 printk(KERN_ALERT "md: out of memory.\n");
367                 return -EINVAL;
368         }
369
370         return 0;
371 }
372
373 static void free_disk_sb(mdk_rdev_t * rdev)
374 {
375         if (rdev->sb_page) {
376                 put_page(rdev->sb_page);
377                 rdev->sb_loaded = 0;
378                 rdev->sb_page = NULL;
379                 rdev->sb_offset = 0;
380                 rdev->size = 0;
381         }
382 }
383
384
385 static int super_written(struct bio *bio, unsigned int bytes_done, int error)
386 {
387         mdk_rdev_t *rdev = bio->bi_private;
388         mddev_t *mddev = rdev->mddev;
389         if (bio->bi_size)
390                 return 1;
391
392         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
393                 md_error(mddev, rdev);
394
395         if (atomic_dec_and_test(&mddev->pending_writes))
396                 wake_up(&mddev->sb_wait);
397         bio_put(bio);
398         return 0;
399 }
400
401 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
402 {
403         struct bio *bio2 = bio->bi_private;
404         mdk_rdev_t *rdev = bio2->bi_private;
405         mddev_t *mddev = rdev->mddev;
406         if (bio->bi_size)
407                 return 1;
408
409         if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
410             error == -EOPNOTSUPP) {
411                 unsigned long flags;
412                 /* barriers don't appear to be supported :-( */
413                 set_bit(BarriersNotsupp, &rdev->flags);
414                 mddev->barriers_work = 0;
415                 spin_lock_irqsave(&mddev->write_lock, flags);
416                 bio2->bi_next = mddev->biolist;
417                 mddev->biolist = bio2;
418                 spin_unlock_irqrestore(&mddev->write_lock, flags);
419                 wake_up(&mddev->sb_wait);
420                 bio_put(bio);
421                 return 0;
422         }
423         bio_put(bio2);
424         bio->bi_private = rdev;
425         return super_written(bio, bytes_done, error);
426 }
427
428 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
429                    sector_t sector, int size, struct page *page)
430 {
431         /* write first size bytes of page to sector of rdev
432          * Increment mddev->pending_writes before returning
433          * and decrement it on completion, waking up sb_wait
434          * if zero is reached.
435          * If an error occurred, call md_error
436          *
437          * As we might need to resubmit the request if BIO_RW_BARRIER
438          * causes ENOTSUPP, we allocate a spare bio...
439          */
440         struct bio *bio = bio_alloc(GFP_NOIO, 1);
441         int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
442
443         bio->bi_bdev = rdev->bdev;
444         bio->bi_sector = sector;
445         bio_add_page(bio, page, size, 0);
446         bio->bi_private = rdev;
447         bio->bi_end_io = super_written;
448         bio->bi_rw = rw;
449
450         atomic_inc(&mddev->pending_writes);
451         if (!test_bit(BarriersNotsupp, &rdev->flags)) {
452                 struct bio *rbio;
453                 rw |= (1<<BIO_RW_BARRIER);
454                 rbio = bio_clone(bio, GFP_NOIO);
455                 rbio->bi_private = bio;
456                 rbio->bi_end_io = super_written_barrier;
457                 submit_bio(rw, rbio);
458         } else
459                 submit_bio(rw, bio);
460 }
461
462 void md_super_wait(mddev_t *mddev)
463 {
464         /* wait for all superblock writes that were scheduled to complete.
465          * if any had to be retried (due to BARRIER problems), retry them
466          */
467         DEFINE_WAIT(wq);
468         for(;;) {
469                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
470                 if (atomic_read(&mddev->pending_writes)==0)
471                         break;
472                 while (mddev->biolist) {
473                         struct bio *bio;
474                         spin_lock_irq(&mddev->write_lock);
475                         bio = mddev->biolist;
476                         mddev->biolist = bio->bi_next ;
477                         bio->bi_next = NULL;
478                         spin_unlock_irq(&mddev->write_lock);
479                         submit_bio(bio->bi_rw, bio);
480                 }
481                 schedule();
482         }
483         finish_wait(&mddev->sb_wait, &wq);
484 }
485
486 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
487 {
488         if (bio->bi_size)
489                 return 1;
490
491         complete((struct completion*)bio->bi_private);
492         return 0;
493 }
494
495 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
496                    struct page *page, int rw)
497 {
498         struct bio *bio = bio_alloc(GFP_NOIO, 1);
499         struct completion event;
500         int ret;
501
502         rw |= (1 << BIO_RW_SYNC);
503
504         bio->bi_bdev = bdev;
505         bio->bi_sector = sector;
506         bio_add_page(bio, page, size, 0);
507         init_completion(&event);
508         bio->bi_private = &event;
509         bio->bi_end_io = bi_complete;
510         submit_bio(rw, bio);
511         wait_for_completion(&event);
512
513         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
514         bio_put(bio);
515         return ret;
516 }
517 EXPORT_SYMBOL_GPL(sync_page_io);
518
519 static int read_disk_sb(mdk_rdev_t * rdev, int size)
520 {
521         char b[BDEVNAME_SIZE];
522         if (!rdev->sb_page) {
523                 MD_BUG();
524                 return -EINVAL;
525         }
526         if (rdev->sb_loaded)
527                 return 0;
528
529
530         if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
531                 goto fail;
532         rdev->sb_loaded = 1;
533         return 0;
534
535 fail:
536         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
537                 bdevname(rdev->bdev,b));
538         return -EINVAL;
539 }
540
541 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
542 {
543         if (    (sb1->set_uuid0 == sb2->set_uuid0) &&
544                 (sb1->set_uuid1 == sb2->set_uuid1) &&
545                 (sb1->set_uuid2 == sb2->set_uuid2) &&
546                 (sb1->set_uuid3 == sb2->set_uuid3))
547
548                 return 1;
549
550         return 0;
551 }
552
553
554 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
555 {
556         int ret;
557         mdp_super_t *tmp1, *tmp2;
558
559         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
560         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
561
562         if (!tmp1 || !tmp2) {
563                 ret = 0;
564                 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
565                 goto abort;
566         }
567
568         *tmp1 = *sb1;
569         *tmp2 = *sb2;
570
571         /*
572          * nr_disks is not constant
573          */
574         tmp1->nr_disks = 0;
575         tmp2->nr_disks = 0;
576
577         if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
578                 ret = 0;
579         else
580                 ret = 1;
581
582 abort:
583         kfree(tmp1);
584         kfree(tmp2);
585         return ret;
586 }
587
588 static unsigned int calc_sb_csum(mdp_super_t * sb)
589 {
590         unsigned int disk_csum, csum;
591
592         disk_csum = sb->sb_csum;
593         sb->sb_csum = 0;
594         csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
595         sb->sb_csum = disk_csum;
596         return csum;
597 }
598
599
600 /*
601  * Handle superblock details.
602  * We want to be able to handle multiple superblock formats
603  * so we have a common interface to them all, and an array of
604  * different handlers.
605  * We rely on user-space to write the initial superblock, and support
606  * reading and updating of superblocks.
607  * Interface methods are:
608  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
609  *      loads and validates a superblock on dev.
610  *      if refdev != NULL, compare superblocks on both devices
611  *    Return:
612  *      0 - dev has a superblock that is compatible with refdev
613  *      1 - dev has a superblock that is compatible and newer than refdev
614  *          so dev should be used as the refdev in future
615  *     -EINVAL superblock incompatible or invalid
616  *     -othererror e.g. -EIO
617  *
618  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
619  *      Verify that dev is acceptable into mddev.
620  *       The first time, mddev->raid_disks will be 0, and data from
621  *       dev should be merged in.  Subsequent calls check that dev
622  *       is new enough.  Return 0 or -EINVAL
623  *
624  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
625  *     Update the superblock for rdev with data in mddev
626  *     This does not write to disc.
627  *
628  */
629
630 struct super_type  {
631         char            *name;
632         struct module   *owner;
633         int             (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
634         int             (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
635         void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
636 };
637
638 /*
639  * load_super for 0.90.0 
640  */
641 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
642 {
643         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
644         mdp_super_t *sb;
645         int ret;
646         sector_t sb_offset;
647
648         /*
649          * Calculate the position of the superblock,
650          * it's at the end of the disk.
651          *
652          * It also happens to be a multiple of 4Kb.
653          */
654         sb_offset = calc_dev_sboffset(rdev->bdev);
655         rdev->sb_offset = sb_offset;
656
657         ret = read_disk_sb(rdev, MD_SB_BYTES);
658         if (ret) return ret;
659
660         ret = -EINVAL;
661
662         bdevname(rdev->bdev, b);
663         sb = (mdp_super_t*)page_address(rdev->sb_page);
664
665         if (sb->md_magic != MD_SB_MAGIC) {
666                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
667                        b);
668                 goto abort;
669         }
670
671         if (sb->major_version != 0 ||
672             sb->minor_version < 90 ||
673             sb->minor_version > 91) {
674                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
675                         sb->major_version, sb->minor_version,
676                         b);
677                 goto abort;
678         }
679
680         if (sb->raid_disks <= 0)
681                 goto abort;
682
683         if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
684                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
685                         b);
686                 goto abort;
687         }
688
689         rdev->preferred_minor = sb->md_minor;
690         rdev->data_offset = 0;
691         rdev->sb_size = MD_SB_BYTES;
692
693         if (sb->level == LEVEL_MULTIPATH)
694                 rdev->desc_nr = -1;
695         else
696                 rdev->desc_nr = sb->this_disk.number;
697
698         if (refdev == 0)
699                 ret = 1;
700         else {
701                 __u64 ev1, ev2;
702                 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
703                 if (!uuid_equal(refsb, sb)) {
704                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
705                                 b, bdevname(refdev->bdev,b2));
706                         goto abort;
707                 }
708                 if (!sb_equal(refsb, sb)) {
709                         printk(KERN_WARNING "md: %s has same UUID"
710                                " but different superblock to %s\n",
711                                b, bdevname(refdev->bdev, b2));
712                         goto abort;
713                 }
714                 ev1 = md_event(sb);
715                 ev2 = md_event(refsb);
716                 if (ev1 > ev2)
717                         ret = 1;
718                 else 
719                         ret = 0;
720         }
721         rdev->size = calc_dev_size(rdev, sb->chunk_size);
722
723         if (rdev->size < sb->size && sb->level > 1)
724                 /* "this cannot possibly happen" ... */
725                 ret = -EINVAL;
726
727  abort:
728         return ret;
729 }
730
731 /*
732  * validate_super for 0.90.0
733  */
734 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
735 {
736         mdp_disk_t *desc;
737         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
738         __u64 ev1 = md_event(sb);
739
740         rdev->raid_disk = -1;
741         rdev->flags = 0;
742         if (mddev->raid_disks == 0) {
743                 mddev->major_version = 0;
744                 mddev->minor_version = sb->minor_version;
745                 mddev->patch_version = sb->patch_version;
746                 mddev->persistent = ! sb->not_persistent;
747                 mddev->chunk_size = sb->chunk_size;
748                 mddev->ctime = sb->ctime;
749                 mddev->utime = sb->utime;
750                 mddev->level = sb->level;
751                 mddev->clevel[0] = 0;
752                 mddev->layout = sb->layout;
753                 mddev->raid_disks = sb->raid_disks;
754                 mddev->size = sb->size;
755                 mddev->events = ev1;
756                 mddev->bitmap_offset = 0;
757                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
758
759                 if (mddev->minor_version >= 91) {
760                         mddev->reshape_position = sb->reshape_position;
761                         mddev->delta_disks = sb->delta_disks;
762                         mddev->new_level = sb->new_level;
763                         mddev->new_layout = sb->new_layout;
764                         mddev->new_chunk = sb->new_chunk;
765                 } else {
766                         mddev->reshape_position = MaxSector;
767                         mddev->delta_disks = 0;
768                         mddev->new_level = mddev->level;
769                         mddev->new_layout = mddev->layout;
770                         mddev->new_chunk = mddev->chunk_size;
771                 }
772
773                 if (sb->state & (1<<MD_SB_CLEAN))
774                         mddev->recovery_cp = MaxSector;
775                 else {
776                         if (sb->events_hi == sb->cp_events_hi && 
777                                 sb->events_lo == sb->cp_events_lo) {
778                                 mddev->recovery_cp = sb->recovery_cp;
779                         } else
780                                 mddev->recovery_cp = 0;
781                 }
782
783                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
784                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
785                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
786                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
787
788                 mddev->max_disks = MD_SB_DISKS;
789
790                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
791                     mddev->bitmap_file == NULL) {
792                         if (mddev->level != 1 && mddev->level != 4
793                             && mddev->level != 5 && mddev->level != 6
794                             && mddev->level != 10) {
795                                 /* FIXME use a better test */
796                                 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
797                                 return -EINVAL;
798                         }
799                         mddev->bitmap_offset = mddev->default_bitmap_offset;
800                 }
801
802         } else if (mddev->pers == NULL) {
803                 /* Insist on good event counter while assembling */
804                 ++ev1;
805                 if (ev1 < mddev->events) 
806                         return -EINVAL;
807         } else if (mddev->bitmap) {
808                 /* if adding to array with a bitmap, then we can accept an
809                  * older device ... but not too old.
810                  */
811                 if (ev1 < mddev->bitmap->events_cleared)
812                         return 0;
813         } else {
814                 if (ev1 < mddev->events)
815                         /* just a hot-add of a new device, leave raid_disk at -1 */
816                         return 0;
817         }
818
819         if (mddev->level != LEVEL_MULTIPATH) {
820                 desc = sb->disks + rdev->desc_nr;
821
822                 if (desc->state & (1<<MD_DISK_FAULTY))
823                         set_bit(Faulty, &rdev->flags);
824                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
825                             desc->raid_disk < mddev->raid_disks */) {
826                         set_bit(In_sync, &rdev->flags);
827                         rdev->raid_disk = desc->raid_disk;
828                 }
829                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
830                         set_bit(WriteMostly, &rdev->flags);
831         } else /* MULTIPATH are always insync */
832                 set_bit(In_sync, &rdev->flags);
833         return 0;
834 }
835
836 /*
837  * sync_super for 0.90.0
838  */
839 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
840 {
841         mdp_super_t *sb;
842         struct list_head *tmp;
843         mdk_rdev_t *rdev2;
844         int next_spare = mddev->raid_disks;
845
846
847         /* make rdev->sb match mddev data..
848          *
849          * 1/ zero out disks
850          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
851          * 3/ any empty disks < next_spare become removed
852          *
853          * disks[0] gets initialised to REMOVED because
854          * we cannot be sure from other fields if it has
855          * been initialised or not.
856          */
857         int i;
858         int active=0, working=0,failed=0,spare=0,nr_disks=0;
859
860         rdev->sb_size = MD_SB_BYTES;
861
862         sb = (mdp_super_t*)page_address(rdev->sb_page);
863
864         memset(sb, 0, sizeof(*sb));
865
866         sb->md_magic = MD_SB_MAGIC;
867         sb->major_version = mddev->major_version;
868         sb->patch_version = mddev->patch_version;
869         sb->gvalid_words  = 0; /* ignored */
870         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
871         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
872         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
873         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
874
875         sb->ctime = mddev->ctime;
876         sb->level = mddev->level;
877         sb->size  = mddev->size;
878         sb->raid_disks = mddev->raid_disks;
879         sb->md_minor = mddev->md_minor;
880         sb->not_persistent = !mddev->persistent;
881         sb->utime = mddev->utime;
882         sb->state = 0;
883         sb->events_hi = (mddev->events>>32);
884         sb->events_lo = (u32)mddev->events;
885
886         if (mddev->reshape_position == MaxSector)
887                 sb->minor_version = 90;
888         else {
889                 sb->minor_version = 91;
890                 sb->reshape_position = mddev->reshape_position;
891                 sb->new_level = mddev->new_level;
892                 sb->delta_disks = mddev->delta_disks;
893                 sb->new_layout = mddev->new_layout;
894                 sb->new_chunk = mddev->new_chunk;
895         }
896         mddev->minor_version = sb->minor_version;
897         if (mddev->in_sync)
898         {
899                 sb->recovery_cp = mddev->recovery_cp;
900                 sb->cp_events_hi = (mddev->events>>32);
901                 sb->cp_events_lo = (u32)mddev->events;
902                 if (mddev->recovery_cp == MaxSector)
903                         sb->state = (1<< MD_SB_CLEAN);
904         } else
905                 sb->recovery_cp = 0;
906
907         sb->layout = mddev->layout;
908         sb->chunk_size = mddev->chunk_size;
909
910         if (mddev->bitmap && mddev->bitmap_file == NULL)
911                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
912
913         sb->disks[0].state = (1<<MD_DISK_REMOVED);
914         ITERATE_RDEV(mddev,rdev2,tmp) {
915                 mdp_disk_t *d;
916                 int desc_nr;
917                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
918                     && !test_bit(Faulty, &rdev2->flags))
919                         desc_nr = rdev2->raid_disk;
920                 else
921                         desc_nr = next_spare++;
922                 rdev2->desc_nr = desc_nr;
923                 d = &sb->disks[rdev2->desc_nr];
924                 nr_disks++;
925                 d->number = rdev2->desc_nr;
926                 d->major = MAJOR(rdev2->bdev->bd_dev);
927                 d->minor = MINOR(rdev2->bdev->bd_dev);
928                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
929                     && !test_bit(Faulty, &rdev2->flags))
930                         d->raid_disk = rdev2->raid_disk;
931                 else
932                         d->raid_disk = rdev2->desc_nr; /* compatibility */
933                 if (test_bit(Faulty, &rdev2->flags))
934                         d->state = (1<<MD_DISK_FAULTY);
935                 else if (test_bit(In_sync, &rdev2->flags)) {
936                         d->state = (1<<MD_DISK_ACTIVE);
937                         d->state |= (1<<MD_DISK_SYNC);
938                         active++;
939                         working++;
940                 } else {
941                         d->state = 0;
942                         spare++;
943                         working++;
944                 }
945                 if (test_bit(WriteMostly, &rdev2->flags))
946                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
947         }
948         /* now set the "removed" and "faulty" bits on any missing devices */
949         for (i=0 ; i < mddev->raid_disks ; i++) {
950                 mdp_disk_t *d = &sb->disks[i];
951                 if (d->state == 0 && d->number == 0) {
952                         d->number = i;
953                         d->raid_disk = i;
954                         d->state = (1<<MD_DISK_REMOVED);
955                         d->state |= (1<<MD_DISK_FAULTY);
956                         failed++;
957                 }
958         }
959         sb->nr_disks = nr_disks;
960         sb->active_disks = active;
961         sb->working_disks = working;
962         sb->failed_disks = failed;
963         sb->spare_disks = spare;
964
965         sb->this_disk = sb->disks[rdev->desc_nr];
966         sb->sb_csum = calc_sb_csum(sb);
967 }
968
969 /*
970  * version 1 superblock
971  */
972
973 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
974 {
975         unsigned int disk_csum, csum;
976         unsigned long long newcsum;
977         int size = 256 + le32_to_cpu(sb->max_dev)*2;
978         unsigned int *isuper = (unsigned int*)sb;
979         int i;
980
981         disk_csum = sb->sb_csum;
982         sb->sb_csum = 0;
983         newcsum = 0;
984         for (i=0; size>=4; size -= 4 )
985                 newcsum += le32_to_cpu(*isuper++);
986
987         if (size == 2)
988                 newcsum += le16_to_cpu(*(unsigned short*) isuper);
989
990         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
991         sb->sb_csum = disk_csum;
992         return cpu_to_le32(csum);
993 }
994
995 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
996 {
997         struct mdp_superblock_1 *sb;
998         int ret;
999         sector_t sb_offset;
1000         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1001         int bmask;
1002
1003         /*
1004          * Calculate the position of the superblock.
1005          * It is always aligned to a 4K boundary and
1006          * depeding on minor_version, it can be:
1007          * 0: At least 8K, but less than 12K, from end of device
1008          * 1: At start of device
1009          * 2: 4K from start of device.
1010          */
1011         switch(minor_version) {
1012         case 0:
1013                 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
1014                 sb_offset -= 8*2;
1015                 sb_offset &= ~(sector_t)(4*2-1);
1016                 /* convert from sectors to K */
1017                 sb_offset /= 2;
1018                 break;
1019         case 1:
1020                 sb_offset = 0;
1021                 break;
1022         case 2:
1023                 sb_offset = 4;
1024                 break;
1025         default:
1026                 return -EINVAL;
1027         }
1028         rdev->sb_offset = sb_offset;
1029
1030         /* superblock is rarely larger than 1K, but it can be larger,
1031          * and it is safe to read 4k, so we do that
1032          */
1033         ret = read_disk_sb(rdev, 4096);
1034         if (ret) return ret;
1035
1036
1037         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1038
1039         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1040             sb->major_version != cpu_to_le32(1) ||
1041             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1042             le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
1043             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1044                 return -EINVAL;
1045
1046         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1047                 printk("md: invalid superblock checksum on %s\n",
1048                         bdevname(rdev->bdev,b));
1049                 return -EINVAL;
1050         }
1051         if (le64_to_cpu(sb->data_size) < 10) {
1052                 printk("md: data_size too small on %s\n",
1053                        bdevname(rdev->bdev,b));
1054                 return -EINVAL;
1055         }
1056         rdev->preferred_minor = 0xffff;
1057         rdev->data_offset = le64_to_cpu(sb->data_offset);
1058         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1059
1060         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1061         bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
1062         if (rdev->sb_size & bmask)
1063                 rdev-> sb_size = (rdev->sb_size | bmask)+1;
1064
1065         if (refdev == 0)
1066                 ret = 1;
1067         else {
1068                 __u64 ev1, ev2;
1069                 struct mdp_superblock_1 *refsb = 
1070                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1071
1072                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1073                     sb->level != refsb->level ||
1074                     sb->layout != refsb->layout ||
1075                     sb->chunksize != refsb->chunksize) {
1076                         printk(KERN_WARNING "md: %s has strangely different"
1077                                 " superblock to %s\n",
1078                                 bdevname(rdev->bdev,b),
1079                                 bdevname(refdev->bdev,b2));
1080                         return -EINVAL;
1081                 }
1082                 ev1 = le64_to_cpu(sb->events);
1083                 ev2 = le64_to_cpu(refsb->events);
1084
1085                 if (ev1 > ev2)
1086                         ret = 1;
1087                 else
1088                         ret = 0;
1089         }
1090         if (minor_version) 
1091                 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1092         else
1093                 rdev->size = rdev->sb_offset;
1094         if (rdev->size < le64_to_cpu(sb->data_size)/2)
1095                 return -EINVAL;
1096         rdev->size = le64_to_cpu(sb->data_size)/2;
1097         if (le32_to_cpu(sb->chunksize))
1098                 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1099
1100         if (le32_to_cpu(sb->size) > rdev->size*2)
1101                 return -EINVAL;
1102         return ret;
1103 }
1104
1105 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1106 {
1107         struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1108         __u64 ev1 = le64_to_cpu(sb->events);
1109
1110         rdev->raid_disk = -1;
1111         rdev->flags = 0;
1112         if (mddev->raid_disks == 0) {
1113                 mddev->major_version = 1;
1114                 mddev->patch_version = 0;
1115                 mddev->persistent = 1;
1116                 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
1117                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1118                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1119                 mddev->level = le32_to_cpu(sb->level);
1120                 mddev->clevel[0] = 0;
1121                 mddev->layout = le32_to_cpu(sb->layout);
1122                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1123                 mddev->size = le64_to_cpu(sb->size)/2;
1124                 mddev->events = ev1;
1125                 mddev->bitmap_offset = 0;
1126                 mddev->default_bitmap_offset = 1024 >> 9;
1127                 
1128                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1129                 memcpy(mddev->uuid, sb->set_uuid, 16);
1130
1131                 mddev->max_disks =  (4096-256)/2;
1132
1133                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1134                     mddev->bitmap_file == NULL ) {
1135                         if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1136                             && mddev->level != 10) {
1137                                 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1138                                 return -EINVAL;
1139                         }
1140                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1141                 }
1142                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1143                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1144                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1145                         mddev->new_level = le32_to_cpu(sb->new_level);
1146                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1147                         mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1148                 } else {
1149                         mddev->reshape_position = MaxSector;
1150                         mddev->delta_disks = 0;
1151                         mddev->new_level = mddev->level;
1152                         mddev->new_layout = mddev->layout;
1153                         mddev->new_chunk = mddev->chunk_size;
1154                 }
1155
1156         } else if (mddev->pers == NULL) {
1157                 /* Insist of good event counter while assembling */
1158                 ++ev1;
1159                 if (ev1 < mddev->events)
1160                         return -EINVAL;
1161         } else if (mddev->bitmap) {
1162                 /* If adding to array with a bitmap, then we can accept an
1163                  * older device, but not too old.
1164                  */
1165                 if (ev1 < mddev->bitmap->events_cleared)
1166                         return 0;
1167         } else {
1168                 if (ev1 < mddev->events)
1169                         /* just a hot-add of a new device, leave raid_disk at -1 */
1170                         return 0;
1171         }
1172         if (mddev->level != LEVEL_MULTIPATH) {
1173                 int role;
1174                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1175                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1176                 switch(role) {
1177                 case 0xffff: /* spare */
1178                         break;
1179                 case 0xfffe: /* faulty */
1180                         set_bit(Faulty, &rdev->flags);
1181                         break;
1182                 default:
1183                         if ((le32_to_cpu(sb->feature_map) &
1184                              MD_FEATURE_RECOVERY_OFFSET))
1185                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1186                         else
1187                                 set_bit(In_sync, &rdev->flags);
1188                         rdev->raid_disk = role;
1189                         break;
1190                 }
1191                 if (sb->devflags & WriteMostly1)
1192                         set_bit(WriteMostly, &rdev->flags);
1193         } else /* MULTIPATH are always insync */
1194                 set_bit(In_sync, &rdev->flags);
1195
1196         return 0;
1197 }
1198
1199 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1200 {
1201         struct mdp_superblock_1 *sb;
1202         struct list_head *tmp;
1203         mdk_rdev_t *rdev2;
1204         int max_dev, i;
1205         /* make rdev->sb match mddev and rdev data. */
1206
1207         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1208
1209         sb->feature_map = 0;
1210         sb->pad0 = 0;
1211         sb->recovery_offset = cpu_to_le64(0);
1212         memset(sb->pad1, 0, sizeof(sb->pad1));
1213         memset(sb->pad2, 0, sizeof(sb->pad2));
1214         memset(sb->pad3, 0, sizeof(sb->pad3));
1215
1216         sb->utime = cpu_to_le64((__u64)mddev->utime);
1217         sb->events = cpu_to_le64(mddev->events);
1218         if (mddev->in_sync)
1219                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1220         else
1221                 sb->resync_offset = cpu_to_le64(0);
1222
1223         sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1224
1225         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1226         sb->size = cpu_to_le64(mddev->size<<1);
1227
1228         if (mddev->bitmap && mddev->bitmap_file == NULL) {
1229                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1230                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1231         }
1232
1233         if (rdev->raid_disk >= 0 &&
1234             !test_bit(In_sync, &rdev->flags) &&
1235             rdev->recovery_offset > 0) {
1236                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1237                 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
1238         }
1239
1240         if (mddev->reshape_position != MaxSector) {
1241                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1242                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1243                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1244                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1245                 sb->new_level = cpu_to_le32(mddev->new_level);
1246                 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1247         }
1248
1249         max_dev = 0;
1250         ITERATE_RDEV(mddev,rdev2,tmp)
1251                 if (rdev2->desc_nr+1 > max_dev)
1252                         max_dev = rdev2->desc_nr+1;
1253         
1254         sb->max_dev = cpu_to_le32(max_dev);
1255         for (i=0; i<max_dev;i++)
1256                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1257         
1258         ITERATE_RDEV(mddev,rdev2,tmp) {
1259                 i = rdev2->desc_nr;
1260                 if (test_bit(Faulty, &rdev2->flags))
1261                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1262                 else if (test_bit(In_sync, &rdev2->flags))
1263                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1264                 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1265                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1266                 else
1267                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1268         }
1269
1270         sb->sb_csum = calc_sb_1_csum(sb);
1271 }
1272
1273
1274 static struct super_type super_types[] = {
1275         [0] = {
1276                 .name   = "0.90.0",
1277                 .owner  = THIS_MODULE,
1278                 .load_super     = super_90_load,
1279                 .validate_super = super_90_validate,
1280                 .sync_super     = super_90_sync,
1281         },
1282         [1] = {
1283                 .name   = "md-1",
1284                 .owner  = THIS_MODULE,
1285                 .load_super     = super_1_load,
1286                 .validate_super = super_1_validate,
1287                 .sync_super     = super_1_sync,
1288         },
1289 };
1290         
1291 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
1292 {
1293         struct list_head *tmp;
1294         mdk_rdev_t *rdev;
1295
1296         ITERATE_RDEV(mddev,rdev,tmp)
1297                 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
1298                         return rdev;
1299
1300         return NULL;
1301 }
1302
1303 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1304 {
1305         struct list_head *tmp;
1306         mdk_rdev_t *rdev;
1307
1308         ITERATE_RDEV(mddev1,rdev,tmp)
1309                 if (match_dev_unit(mddev2, rdev))
1310                         return 1;
1311
1312         return 0;
1313 }
1314
1315 static LIST_HEAD(pending_raid_disks);
1316
1317 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1318 {
1319         mdk_rdev_t *same_pdev;
1320         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1321         struct kobject *ko;
1322         char *s;
1323
1324         if (rdev->mddev) {
1325                 MD_BUG();
1326                 return -EINVAL;
1327         }
1328         /* make sure rdev->size exceeds mddev->size */
1329         if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1330                 if (mddev->pers)
1331                         /* Cannot change size, so fail */
1332                         return -ENOSPC;
1333                 else
1334                         mddev->size = rdev->size;
1335         }
1336         same_pdev = match_dev_unit(mddev, rdev);
1337         if (same_pdev)
1338                 printk(KERN_WARNING
1339                         "%s: WARNING: %s appears to be on the same physical"
1340                         " disk as %s. True\n     protection against single-disk"
1341                         " failure might be compromised.\n",
1342                         mdname(mddev), bdevname(rdev->bdev,b),
1343                         bdevname(same_pdev->bdev,b2));
1344
1345         /* Verify rdev->desc_nr is unique.
1346          * If it is -1, assign a free number, else
1347          * check number is not in use
1348          */
1349         if (rdev->desc_nr < 0) {
1350                 int choice = 0;
1351                 if (mddev->pers) choice = mddev->raid_disks;
1352                 while (find_rdev_nr(mddev, choice))
1353                         choice++;
1354                 rdev->desc_nr = choice;
1355         } else {
1356                 if (find_rdev_nr(mddev, rdev->desc_nr))
1357                         return -EBUSY;
1358         }
1359         bdevname(rdev->bdev,b);
1360         if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
1361                 return -ENOMEM;
1362         while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
1363                 *s = '!';
1364                         
1365         list_add(&rdev->same_set, &mddev->disks);
1366         rdev->mddev = mddev;
1367         printk(KERN_INFO "md: bind<%s>\n", b);
1368
1369         rdev->kobj.parent = &mddev->kobj;
1370         kobject_add(&rdev->kobj);
1371
1372         if (rdev->bdev->bd_part)
1373                 ko = &rdev->bdev->bd_part->kobj;
1374         else
1375                 ko = &rdev->bdev->bd_disk->kobj;
1376         sysfs_create_link(&rdev->kobj, ko, "block");
1377         bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
1378         return 0;
1379 }
1380
1381 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1382 {
1383         char b[BDEVNAME_SIZE];
1384         if (!rdev->mddev) {
1385                 MD_BUG();
1386                 return;
1387         }
1388         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1389         list_del_init(&rdev->same_set);
1390         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1391         rdev->mddev = NULL;
1392         sysfs_remove_link(&rdev->kobj, "block");
1393         kobject_del(&rdev->kobj);
1394 }
1395
1396 /*
1397  * prevent the device from being mounted, repartitioned or
1398  * otherwise reused by a RAID array (or any other kernel
1399  * subsystem), by bd_claiming the device.
1400  */
1401 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1402 {
1403         int err = 0;
1404         struct block_device *bdev;
1405         char b[BDEVNAME_SIZE];
1406
1407         bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1408         if (IS_ERR(bdev)) {
1409                 printk(KERN_ERR "md: could not open %s.\n",
1410                         __bdevname(dev, b));
1411                 return PTR_ERR(bdev);
1412         }
1413         err = bd_claim(bdev, rdev);
1414         if (err) {
1415                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1416                         bdevname(bdev, b));
1417                 blkdev_put_partition(bdev);
1418                 return err;
1419         }
1420         rdev->bdev = bdev;
1421         return err;
1422 }
1423
1424 static void unlock_rdev(mdk_rdev_t *rdev)
1425 {
1426         struct block_device *bdev = rdev->bdev;
1427         rdev->bdev = NULL;
1428         if (!bdev)
1429                 MD_BUG();
1430         bd_release(bdev);
1431         blkdev_put_partition(bdev);
1432 }
1433
1434 void md_autodetect_dev(dev_t dev);
1435
1436 static void export_rdev(mdk_rdev_t * rdev)
1437 {
1438         char b[BDEVNAME_SIZE];
1439         printk(KERN_INFO "md: export_rdev(%s)\n",
1440                 bdevname(rdev->bdev,b));
1441         if (rdev->mddev)
1442                 MD_BUG();
1443         free_disk_sb(rdev);
1444         list_del_init(&rdev->same_set);
1445 #ifndef MODULE
1446         md_autodetect_dev(rdev->bdev->bd_dev);
1447 #endif
1448         unlock_rdev(rdev);
1449         kobject_put(&rdev->kobj);
1450 }
1451
1452 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1453 {
1454         unbind_rdev_from_array(rdev);
1455         export_rdev(rdev);
1456 }
1457
1458 static void export_array(mddev_t *mddev)
1459 {
1460         struct list_head *tmp;
1461         mdk_rdev_t *rdev;
1462
1463         ITERATE_RDEV(mddev,rdev,tmp) {
1464                 if (!rdev->mddev) {
1465                         MD_BUG();
1466                         continue;
1467                 }
1468                 kick_rdev_from_array(rdev);
1469         }
1470         if (!list_empty(&mddev->disks))
1471                 MD_BUG();
1472         mddev->raid_disks = 0;
1473         mddev->major_version = 0;
1474 }
1475
1476 static void print_desc(mdp_disk_t *desc)
1477 {
1478         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1479                 desc->major,desc->minor,desc->raid_disk,desc->state);
1480 }
1481
1482 static void print_sb(mdp_super_t *sb)
1483 {
1484         int i;
1485
1486         printk(KERN_INFO 
1487                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1488                 sb->major_version, sb->minor_version, sb->patch_version,
1489                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1490                 sb->ctime);
1491         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1492                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1493                 sb->md_minor, sb->layout, sb->chunk_size);
1494         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1495                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1496                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1497                 sb->failed_disks, sb->spare_disks,
1498                 sb->sb_csum, (unsigned long)sb->events_lo);
1499
1500         printk(KERN_INFO);
1501         for (i = 0; i < MD_SB_DISKS; i++) {
1502                 mdp_disk_t *desc;
1503
1504                 desc = sb->disks + i;
1505                 if (desc->number || desc->major || desc->minor ||
1506                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1507                         printk("     D %2d: ", i);
1508                         print_desc(desc);
1509                 }
1510         }
1511         printk(KERN_INFO "md:     THIS: ");
1512         print_desc(&sb->this_disk);
1513
1514 }
1515
1516 static void print_rdev(mdk_rdev_t *rdev)
1517 {
1518         char b[BDEVNAME_SIZE];
1519         printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1520                 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1521                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1522                 rdev->desc_nr);
1523         if (rdev->sb_loaded) {
1524                 printk(KERN_INFO "md: rdev superblock:\n");
1525                 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1526         } else
1527                 printk(KERN_INFO "md: no rdev superblock!\n");
1528 }
1529
1530 static void md_print_devices(void)
1531 {
1532         struct list_head *tmp, *tmp2;
1533         mdk_rdev_t *rdev;
1534         mddev_t *mddev;
1535         char b[BDEVNAME_SIZE];
1536
1537         printk("\n");
1538         printk("md:     **********************************\n");
1539         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1540         printk("md:     **********************************\n");
1541         ITERATE_MDDEV(mddev,tmp) {
1542
1543                 if (mddev->bitmap)
1544                         bitmap_print_sb(mddev->bitmap);
1545                 else
1546                         printk("%s: ", mdname(mddev));
1547                 ITERATE_RDEV(mddev,rdev,tmp2)
1548                         printk("<%s>", bdevname(rdev->bdev,b));
1549                 printk("\n");
1550
1551                 ITERATE_RDEV(mddev,rdev,tmp2)
1552                         print_rdev(rdev);
1553         }
1554         printk("md:     **********************************\n");
1555         printk("\n");
1556 }
1557
1558
1559 static void sync_sbs(mddev_t * mddev, int nospares)
1560 {
1561         /* Update each superblock (in-memory image), but
1562          * if we are allowed to, skip spares which already
1563          * have the right event counter, or have one earlier
1564          * (which would mean they aren't being marked as dirty
1565          * with the rest of the array)
1566          */
1567         mdk_rdev_t *rdev;
1568         struct list_head *tmp;
1569
1570         ITERATE_RDEV(mddev,rdev,tmp) {
1571                 if (rdev->sb_events == mddev->events ||
1572                     (nospares &&
1573                      rdev->raid_disk < 0 &&
1574                      (rdev->sb_events&1)==0 &&
1575                      rdev->sb_events+1 == mddev->events)) {
1576                         /* Don't update this superblock */
1577                         rdev->sb_loaded = 2;
1578                 } else {
1579                         super_types[mddev->major_version].
1580                                 sync_super(mddev, rdev);
1581                         rdev->sb_loaded = 1;
1582                 }
1583         }
1584 }
1585
1586 void md_update_sb(mddev_t * mddev)
1587 {
1588         int err;
1589         struct list_head *tmp;
1590         mdk_rdev_t *rdev;
1591         int sync_req;
1592         int nospares = 0;
1593
1594 repeat:
1595         spin_lock_irq(&mddev->write_lock);
1596         sync_req = mddev->in_sync;
1597         mddev->utime = get_seconds();
1598         if (mddev->sb_dirty == 3)
1599                 /* just a clean<-> dirty transition, possibly leave spares alone,
1600                  * though if events isn't the right even/odd, we will have to do
1601                  * spares after all
1602                  */
1603                 nospares = 1;
1604
1605         /* If this is just a dirty<->clean transition, and the array is clean
1606          * and 'events' is odd, we can roll back to the previous clean state */
1607         if (mddev->sb_dirty == 3
1608             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1609             && (mddev->events & 1))
1610                 mddev->events--;
1611         else {
1612                 /* otherwise we have to go forward and ... */
1613                 mddev->events ++;
1614                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1615                         /* .. if the array isn't clean, insist on an odd 'events' */
1616                         if ((mddev->events&1)==0) {
1617                                 mddev->events++;
1618                                 nospares = 0;
1619                         }
1620                 } else {
1621                         /* otherwise insist on an even 'events' (for clean states) */
1622                         if ((mddev->events&1)) {
1623                                 mddev->events++;
1624                                 nospares = 0;
1625                         }
1626                 }
1627         }
1628
1629         if (!mddev->events) {
1630                 /*
1631                  * oops, this 64-bit counter should never wrap.
1632                  * Either we are in around ~1 trillion A.C., assuming
1633                  * 1 reboot per second, or we have a bug:
1634                  */
1635                 MD_BUG();
1636                 mddev->events --;
1637         }
1638         mddev->sb_dirty = 2;
1639         sync_sbs(mddev, nospares);
1640
1641         /*
1642          * do not write anything to disk if using
1643          * nonpersistent superblocks
1644          */
1645         if (!mddev->persistent) {
1646                 mddev->sb_dirty = 0;
1647                 spin_unlock_irq(&mddev->write_lock);
1648                 wake_up(&mddev->sb_wait);
1649                 return;
1650         }
1651         spin_unlock_irq(&mddev->write_lock);
1652
1653         dprintk(KERN_INFO 
1654                 "md: updating %s RAID superblock on device (in sync %d)\n",
1655                 mdname(mddev),mddev->in_sync);
1656
1657         err = bitmap_update_sb(mddev->bitmap);
1658         ITERATE_RDEV(mddev,rdev,tmp) {
1659                 char b[BDEVNAME_SIZE];
1660                 dprintk(KERN_INFO "md: ");
1661                 if (rdev->sb_loaded != 1)
1662                         continue; /* no noise on spare devices */
1663                 if (test_bit(Faulty, &rdev->flags))
1664                         dprintk("(skipping faulty ");
1665
1666                 dprintk("%s ", bdevname(rdev->bdev,b));
1667                 if (!test_bit(Faulty, &rdev->flags)) {
1668                         md_super_write(mddev,rdev,
1669                                        rdev->sb_offset<<1, rdev->sb_size,
1670                                        rdev->sb_page);
1671                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1672                                 bdevname(rdev->bdev,b),
1673                                 (unsigned long long)rdev->sb_offset);
1674                         rdev->sb_events = mddev->events;
1675
1676                 } else
1677                         dprintk(")\n");
1678                 if (mddev->level == LEVEL_MULTIPATH)
1679                         /* only need to write one superblock... */
1680                         break;
1681         }
1682         md_super_wait(mddev);
1683         /* if there was a failure, sb_dirty was set to 1, and we re-write super */
1684
1685         spin_lock_irq(&mddev->write_lock);
1686         if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
1687                 /* have to write it out again */
1688                 spin_unlock_irq(&mddev->write_lock);
1689                 goto repeat;
1690         }
1691         mddev->sb_dirty = 0;
1692         spin_unlock_irq(&mddev->write_lock);
1693         wake_up(&mddev->sb_wait);
1694
1695 }
1696 EXPORT_SYMBOL_GPL(md_update_sb);
1697
1698 /* words written to sysfs files may, or my not, be \n terminated.
1699  * We want to accept with case. For this we use cmd_match.
1700  */
1701 static int cmd_match(const char *cmd, const char *str)
1702 {
1703         /* See if cmd, written into a sysfs file, matches
1704          * str.  They must either be the same, or cmd can
1705          * have a trailing newline
1706          */
1707         while (*cmd && *str && *cmd == *str) {
1708                 cmd++;
1709                 str++;
1710         }
1711         if (*cmd == '\n')
1712                 cmd++;
1713         if (*str || *cmd)
1714                 return 0;
1715         return 1;
1716 }
1717
1718 struct rdev_sysfs_entry {
1719         struct attribute attr;
1720         ssize_t (*show)(mdk_rdev_t *, char *);
1721         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1722 };
1723
1724 static ssize_t
1725 state_show(mdk_rdev_t *rdev, char *page)
1726 {
1727         char *sep = "";
1728         int len=0;
1729
1730         if (test_bit(Faulty, &rdev->flags)) {
1731                 len+= sprintf(page+len, "%sfaulty",sep);
1732                 sep = ",";
1733         }
1734         if (test_bit(In_sync, &rdev->flags)) {
1735                 len += sprintf(page+len, "%sin_sync",sep);
1736                 sep = ",";
1737         }
1738         if (test_bit(WriteMostly, &rdev->flags)) {
1739                 len += sprintf(page+len, "%swrite_mostly",sep);
1740                 sep = ",";
1741         }
1742         if (!test_bit(Faulty, &rdev->flags) &&
1743             !test_bit(In_sync, &rdev->flags)) {
1744                 len += sprintf(page+len, "%sspare", sep);
1745                 sep = ",";
1746         }
1747         return len+sprintf(page+len, "\n");
1748 }
1749
1750 static ssize_t
1751 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1752 {
1753         /* can write
1754          *  faulty  - simulates and error
1755          *  remove  - disconnects the device
1756          *  writemostly - sets write_mostly
1757          *  -writemostly - clears write_mostly
1758          */
1759         int err = -EINVAL;
1760         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1761                 md_error(rdev->mddev, rdev);
1762                 err = 0;
1763         } else if (cmd_match(buf, "remove")) {
1764                 if (rdev->raid_disk >= 0)
1765                         err = -EBUSY;
1766                 else {
1767                         mddev_t *mddev = rdev->mddev;
1768                         kick_rdev_from_array(rdev);
1769                         md_update_sb(mddev);
1770                         md_new_event(mddev);
1771                         err = 0;
1772                 }
1773         } else if (cmd_match(buf, "writemostly")) {
1774                 set_bit(WriteMostly, &rdev->flags);
1775                 err = 0;
1776         } else if (cmd_match(buf, "-writemostly")) {
1777                 clear_bit(WriteMostly, &rdev->flags);
1778                 err = 0;
1779         }
1780         return err ? err : len;
1781 }
1782 static struct rdev_sysfs_entry
1783 rdev_state = __ATTR(state, 0644, state_show, state_store);
1784
1785 static ssize_t
1786 super_show(mdk_rdev_t *rdev, char *page)
1787 {
1788         if (rdev->sb_loaded && rdev->sb_size) {
1789                 memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
1790                 return rdev->sb_size;
1791         } else
1792                 return 0;
1793 }
1794 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1795
1796 static ssize_t
1797 errors_show(mdk_rdev_t *rdev, char *page)
1798 {
1799         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1800 }
1801
1802 static ssize_t
1803 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1804 {
1805         char *e;
1806         unsigned long n = simple_strtoul(buf, &e, 10);
1807         if (*buf && (*e == 0 || *e == '\n')) {
1808                 atomic_set(&rdev->corrected_errors, n);
1809                 return len;
1810         }
1811         return -EINVAL;
1812 }
1813 static struct rdev_sysfs_entry rdev_errors =
1814 __ATTR(errors, 0644, errors_show, errors_store);
1815
1816 static ssize_t
1817 slot_show(mdk_rdev_t *rdev, char *page)
1818 {
1819         if (rdev->raid_disk < 0)
1820                 return sprintf(page, "none\n");
1821         else
1822                 return sprintf(page, "%d\n", rdev->raid_disk);
1823 }
1824
1825 static ssize_t
1826 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1827 {
1828         char *e;
1829         int slot = simple_strtoul(buf, &e, 10);
1830         if (strncmp(buf, "none", 4)==0)
1831                 slot = -1;
1832         else if (e==buf || (*e && *e!= '\n'))
1833                 return -EINVAL;
1834         if (rdev->mddev->pers)
1835                 /* Cannot set slot in active array (yet) */
1836                 return -EBUSY;
1837         if (slot >= rdev->mddev->raid_disks)
1838                 return -ENOSPC;
1839         rdev->raid_disk = slot;
1840         /* assume it is working */
1841         rdev->flags = 0;
1842         set_bit(In_sync, &rdev->flags);
1843         return len;
1844 }
1845
1846
1847 static struct rdev_sysfs_entry rdev_slot =
1848 __ATTR(slot, 0644, slot_show, slot_store);
1849
1850 static ssize_t
1851 offset_show(mdk_rdev_t *rdev, char *page)
1852 {
1853         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1854 }
1855
1856 static ssize_t
1857 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1858 {
1859         char *e;
1860         unsigned long long offset = simple_strtoull(buf, &e, 10);
1861         if (e==buf || (*e && *e != '\n'))
1862                 return -EINVAL;
1863         if (rdev->mddev->pers)
1864                 return -EBUSY;
1865         rdev->data_offset = offset;
1866         return len;
1867 }
1868
1869 static struct rdev_sysfs_entry rdev_offset =
1870 __ATTR(offset, 0644, offset_show, offset_store);
1871
1872 static ssize_t
1873 rdev_size_show(mdk_rdev_t *rdev, char *page)
1874 {
1875         return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1876 }
1877
1878 static ssize_t
1879 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1880 {
1881         char *e;
1882         unsigned long long size = simple_strtoull(buf, &e, 10);
1883         if (e==buf || (*e && *e != '\n'))
1884                 return -EINVAL;
1885         if (rdev->mddev->pers)
1886                 return -EBUSY;
1887         rdev->size = size;
1888         if (size < rdev->mddev->size || rdev->mddev->size == 0)
1889                 rdev->mddev->size = size;
1890         return len;
1891 }
1892
1893 static struct rdev_sysfs_entry rdev_size =
1894 __ATTR(size, 0644, rdev_size_show, rdev_size_store);
1895
1896 static struct attribute *rdev_default_attrs[] = {
1897         &rdev_state.attr,
1898         &rdev_super.attr,
1899         &rdev_errors.attr,
1900         &rdev_slot.attr,
1901         &rdev_offset.attr,
1902         &rdev_size.attr,
1903         NULL,
1904 };
1905 static ssize_t
1906 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1907 {
1908         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1909         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1910
1911         if (!entry->show)
1912                 return -EIO;
1913         return entry->show(rdev, page);
1914 }
1915
1916 static ssize_t
1917 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
1918               const char *page, size_t length)
1919 {
1920         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
1921         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
1922
1923         if (!entry->store)
1924                 return -EIO;
1925         return entry->store(rdev, page, length);
1926 }
1927
1928 static void rdev_free(struct kobject *ko)
1929 {
1930         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
1931         kfree(rdev);
1932 }
1933 static struct sysfs_ops rdev_sysfs_ops = {
1934         .show           = rdev_attr_show,
1935         .store          = rdev_attr_store,
1936 };
1937 static struct kobj_type rdev_ktype = {
1938         .release        = rdev_free,
1939         .sysfs_ops      = &rdev_sysfs_ops,
1940         .default_attrs  = rdev_default_attrs,
1941 };
1942
1943 /*
1944  * Import a device. If 'super_format' >= 0, then sanity check the superblock
1945  *
1946  * mark the device faulty if:
1947  *
1948  *   - the device is nonexistent (zero size)
1949  *   - the device has no valid superblock
1950  *
1951  * a faulty rdev _never_ has rdev->sb set.
1952  */
1953 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1954 {
1955         char b[BDEVNAME_SIZE];
1956         int err;
1957         mdk_rdev_t *rdev;
1958         sector_t size;
1959
1960         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1961         if (!rdev) {
1962                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1963                 return ERR_PTR(-ENOMEM);
1964         }
1965
1966         if ((err = alloc_disk_sb(rdev)))
1967                 goto abort_free;
1968
1969         err = lock_rdev(rdev, newdev);
1970         if (err)
1971                 goto abort_free;
1972
1973         rdev->kobj.parent = NULL;
1974         rdev->kobj.ktype = &rdev_ktype;
1975         kobject_init(&rdev->kobj);
1976
1977         rdev->desc_nr = -1;
1978         rdev->flags = 0;
1979         rdev->data_offset = 0;
1980         rdev->sb_events = 0;
1981         atomic_set(&rdev->nr_pending, 0);
1982         atomic_set(&rdev->read_errors, 0);
1983         atomic_set(&rdev->corrected_errors, 0);
1984
1985         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1986         if (!size) {
1987                 printk(KERN_WARNING 
1988                         "md: %s has zero or unknown size, marking faulty!\n",
1989                         bdevname(rdev->bdev,b));
1990                 err = -EINVAL;
1991                 goto abort_free;
1992         }
1993
1994         if (super_format >= 0) {
1995                 err = super_types[super_format].
1996                         load_super(rdev, NULL, super_minor);
1997                 if (err == -EINVAL) {
1998                         printk(KERN_WARNING 
1999                                 "md: %s has invalid sb, not importing!\n",
2000                                 bdevname(rdev->bdev,b));
2001                         goto abort_free;
2002                 }
2003                 if (err < 0) {
2004                         printk(KERN_WARNING 
2005                                 "md: could not read %s's sb, not importing!\n",
2006                                 bdevname(rdev->bdev,b));
2007                         goto abort_free;
2008                 }
2009         }
2010         INIT_LIST_HEAD(&rdev->same_set);
2011
2012         return rdev;
2013
2014 abort_free:
2015         if (rdev->sb_page) {
2016                 if (rdev->bdev)
2017                         unlock_rdev(rdev);
2018                 free_disk_sb(rdev);
2019         }
2020         kfree(rdev);
2021         return ERR_PTR(err);
2022 }
2023
2024 /*
2025  * Check a full RAID array for plausibility
2026  */
2027
2028
2029 static void analyze_sbs(mddev_t * mddev)
2030 {
2031         int i;
2032         struct list_head *tmp;
2033         mdk_rdev_t *rdev, *freshest;
2034         char b[BDEVNAME_SIZE];
2035
2036         freshest = NULL;
2037         ITERATE_RDEV(mddev,rdev,tmp)
2038                 switch (super_types[mddev->major_version].
2039                         load_super(rdev, freshest, mddev->minor_version)) {
2040                 case 1:
2041                         freshest = rdev;
2042                         break;
2043                 case 0:
2044                         break;
2045                 default:
2046                         printk( KERN_ERR \
2047                                 "md: fatal superblock inconsistency in %s"
2048                                 " -- removing from array\n", 
2049                                 bdevname(rdev->bdev,b));
2050                         kick_rdev_from_array(rdev);
2051                 }
2052
2053
2054         super_types[mddev->major_version].
2055                 validate_super(mddev, freshest);
2056
2057         i = 0;
2058         ITERATE_RDEV(mddev,rdev,tmp) {
2059                 if (rdev != freshest)
2060                         if (super_types[mddev->major_version].
2061                             validate_super(mddev, rdev)) {
2062                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2063                                         " from array!\n",
2064                                         bdevname(rdev->bdev,b));
2065                                 kick_rdev_from_array(rdev);
2066                                 continue;
2067                         }
2068                 if (mddev->level == LEVEL_MULTIPATH) {
2069                         rdev->desc_nr = i++;
2070                         rdev->raid_disk = rdev->desc_nr;
2071                         set_bit(In_sync, &rdev->flags);
2072                 }
2073         }
2074
2075
2076
2077         if (mddev->recovery_cp != MaxSector &&
2078             mddev->level >= 1)
2079                 printk(KERN_ERR "md: %s: raid array is not clean"
2080                        " -- starting background reconstruction\n",
2081                        mdname(mddev));
2082
2083 }
2084
2085 static ssize_t
2086 safe_delay_show(mddev_t *mddev, char *page)
2087 {
2088         int msec = (mddev->safemode_delay*1000)/HZ;
2089         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2090 }
2091 static ssize_t
2092 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2093 {
2094         int scale=1;
2095         int dot=0;
2096         int i;
2097         unsigned long msec;
2098         char buf[30];
2099         char *e;
2100         /* remove a period, and count digits after it */
2101         if (len >= sizeof(buf))
2102                 return -EINVAL;
2103         strlcpy(buf, cbuf, len);
2104         buf[len] = 0;
2105         for (i=0; i<len; i++) {
2106                 if (dot) {
2107                         if (isdigit(buf[i])) {
2108                                 buf[i-1] = buf[i];
2109                                 scale *= 10;
2110                         }
2111                         buf[i] = 0;
2112                 } else if (buf[i] == '.') {
2113                         dot=1;
2114                         buf[i] = 0;
2115                 }
2116         }
2117         msec = simple_strtoul(buf, &e, 10);
2118         if (e == buf || (*e && *e != '\n'))
2119                 return -EINVAL;
2120         msec = (msec * 1000) / scale;
2121         if (msec == 0)
2122                 mddev->safemode_delay = 0;
2123         else {
2124                 mddev->safemode_delay = (msec*HZ)/1000;
2125                 if (mddev->safemode_delay == 0)
2126                         mddev->safemode_delay = 1;
2127         }
2128         return len;
2129 }
2130 static struct md_sysfs_entry md_safe_delay =
2131 __ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store);
2132
2133 static ssize_t
2134 level_show(mddev_t *mddev, char *page)
2135 {
2136         struct mdk_personality *p = mddev->pers;
2137         if (p)
2138                 return sprintf(page, "%s\n", p->name);
2139         else if (mddev->clevel[0])
2140                 return sprintf(page, "%s\n", mddev->clevel);
2141         else if (mddev->level != LEVEL_NONE)
2142                 return sprintf(page, "%d\n", mddev->level);
2143         else
2144                 return 0;
2145 }
2146
2147 static ssize_t
2148 level_store(mddev_t *mddev, const char *buf, size_t len)
2149 {
2150         int rv = len;
2151         if (mddev->pers)
2152                 return -EBUSY;
2153         if (len == 0)
2154                 return 0;
2155         if (len >= sizeof(mddev->clevel))
2156                 return -ENOSPC;
2157         strncpy(mddev->clevel, buf, len);
2158         if (mddev->clevel[len-1] == '\n')
2159                 len--;
2160         mddev->clevel[len] = 0;
2161         mddev->level = LEVEL_NONE;
2162         return rv;
2163 }
2164
2165 static struct md_sysfs_entry md_level =
2166 __ATTR(level, 0644, level_show, level_store);
2167
2168
2169 static ssize_t
2170 layout_show(mddev_t *mddev, char *page)
2171 {
2172         /* just a number, not meaningful for all levels */
2173         return sprintf(page, "%d\n", mddev->layout);
2174 }
2175
2176 static ssize_t
2177 layout_store(mddev_t *mddev, const char *buf, size_t len)
2178 {
2179         char *e;
2180         unsigned long n = simple_strtoul(buf, &e, 10);
2181         if (mddev->pers)
2182                 return -EBUSY;
2183
2184         if (!*buf || (*e && *e != '\n'))
2185                 return -EINVAL;
2186
2187         mddev->layout = n;
2188         return len;
2189 }
2190 static struct md_sysfs_entry md_layout =
2191 __ATTR(layout, 0655, layout_show, layout_store);
2192
2193
2194 static ssize_t
2195 raid_disks_show(mddev_t *mddev, char *page)
2196 {
2197         if (mddev->raid_disks == 0)
2198                 return 0;
2199         return sprintf(page, "%d\n", mddev->raid_disks);
2200 }
2201
2202 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2203
2204 static ssize_t
2205 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2206 {
2207         /* can only set raid_disks if array is not yet active */
2208         char *e;
2209         int rv = 0;
2210         unsigned long n = simple_strtoul(buf, &e, 10);
2211
2212         if (!*buf || (*e && *e != '\n'))
2213                 return -EINVAL;
2214
2215         if (mddev->pers)
2216                 rv = update_raid_disks(mddev, n);
2217         else
2218                 mddev->raid_disks = n;
2219         return rv ? rv : len;
2220 }
2221 static struct md_sysfs_entry md_raid_disks =
2222 __ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
2223
2224 static ssize_t
2225 chunk_size_show(mddev_t *mddev, char *page)
2226 {
2227         return sprintf(page, "%d\n", mddev->chunk_size);
2228 }
2229
2230 static ssize_t
2231 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2232 {
2233         /* can only set chunk_size if array is not yet active */
2234         char *e;
2235         unsigned long n = simple_strtoul(buf, &e, 10);
2236
2237         if (mddev->pers)
2238                 return -EBUSY;
2239         if (!*buf || (*e && *e != '\n'))
2240                 return -EINVAL;
2241
2242         mddev->chunk_size = n;
2243         return len;
2244 }
2245 static struct md_sysfs_entry md_chunk_size =
2246 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2247
2248 static ssize_t
2249 resync_start_show(mddev_t *mddev, char *page)
2250 {
2251         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2252 }
2253
2254 static ssize_t
2255 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2256 {
2257         /* can only set chunk_size if array is not yet active */
2258         char *e;
2259         unsigned long long n = simple_strtoull(buf, &e, 10);
2260
2261         if (mddev->pers)
2262                 return -EBUSY;
2263         if (!*buf || (*e && *e != '\n'))
2264                 return -EINVAL;
2265
2266         mddev->recovery_cp = n;
2267         return len;
2268 }
2269 static struct md_sysfs_entry md_resync_start =
2270 __ATTR(resync_start, 0644, resync_start_show, resync_start_store);
2271
2272 /*
2273  * The array state can be:
2274  *
2275  * clear
2276  *     No devices, no size, no level
2277  *     Equivalent to STOP_ARRAY ioctl
2278  * inactive
2279  *     May have some settings, but array is not active
2280  *        all IO results in error
2281  *     When written, doesn't tear down array, but just stops it
2282  * suspended (not supported yet)
2283  *     All IO requests will block. The array can be reconfigured.
2284  *     Writing this, if accepted, will block until array is quiessent
2285  * readonly
2286  *     no resync can happen.  no superblocks get written.
2287  *     write requests fail
2288  * read-auto
2289  *     like readonly, but behaves like 'clean' on a write request.
2290  *
2291  * clean - no pending writes, but otherwise active.
2292  *     When written to inactive array, starts without resync
2293  *     If a write request arrives then
2294  *       if metadata is known, mark 'dirty' and switch to 'active'.
2295  *       if not known, block and switch to write-pending
2296  *     If written to an active array that has pending writes, then fails.
2297  * active
2298  *     fully active: IO and resync can be happening.
2299  *     When written to inactive array, starts with resync
2300  *
2301  * write-pending
2302  *     clean, but writes are blocked waiting for 'active' to be written.
2303  *
2304  * active-idle
2305  *     like active, but no writes have been seen for a while (100msec).
2306  *
2307  */
2308 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2309                    write_pending, active_idle, bad_word};
2310 static char *array_states[] = {
2311         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2312         "write-pending", "active-idle", NULL };
2313
2314 static int match_word(const char *word, char **list)
2315 {
2316         int n;
2317         for (n=0; list[n]; n++)
2318                 if (cmd_match(word, list[n]))
2319                         break;
2320         return n;
2321 }
2322
2323 static ssize_t
2324 array_state_show(mddev_t *mddev, char *page)
2325 {
2326         enum array_state st = inactive;
2327
2328         if (mddev->pers)
2329                 switch(mddev->ro) {
2330                 case 1:
2331                         st = readonly;
2332                         break;
2333                 case 2:
2334                         st = read_auto;
2335                         break;
2336                 case 0:
2337                         if (mddev->in_sync)
2338                                 st = clean;
2339                         else if (mddev->safemode)
2340                                 st = active_idle;
2341                         else
2342                                 st = active;
2343                 }
2344         else {
2345                 if (list_empty(&mddev->disks) &&
2346                     mddev->raid_disks == 0 &&
2347                     mddev->size == 0)
2348                         st = clear;
2349                 else
2350                         st = inactive;
2351         }
2352         return sprintf(page, "%s\n", array_states[st]);
2353 }
2354
2355 static int do_md_stop(mddev_t * mddev, int ro);
2356 static int do_md_run(mddev_t * mddev);
2357 static int restart_array(mddev_t *mddev);
2358
2359 static ssize_t
2360 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2361 {
2362         int err = -EINVAL;
2363         enum array_state st = match_word(buf, array_states);
2364         switch(st) {
2365         case bad_word:
2366                 break;
2367         case clear:
2368                 /* stopping an active array */
2369                 if (mddev->pers) {
2370                         if (atomic_read(&mddev->active) > 1)
2371                                 return -EBUSY;
2372                         err = do_md_stop(mddev, 0);
2373                 }
2374                 break;
2375         case inactive:
2376                 /* stopping an active array */
2377                 if (mddev->pers) {
2378                         if (atomic_read(&mddev->active) > 1)
2379                                 return -EBUSY;
2380                         err = do_md_stop(mddev, 2);
2381                 }
2382                 break;
2383         case suspended:
2384                 break; /* not supported yet */
2385         case readonly:
2386                 if (mddev->pers)
2387                         err = do_md_stop(mddev, 1);
2388                 else {
2389                         mddev->ro = 1;
2390                         err = do_md_run(mddev);
2391                 }
2392                 break;
2393         case read_auto:
2394                 /* stopping an active array */
2395                 if (mddev->pers) {
2396                         err = do_md_stop(mddev, 1);
2397                         if (err == 0)
2398                                 mddev->ro = 2; /* FIXME mark devices writable */
2399                 } else {
2400                         mddev->ro = 2;
2401                         err = do_md_run(mddev);
2402                 }
2403                 break;
2404         case clean:
2405                 if (mddev->pers) {
2406                         restart_array(mddev);
2407                         spin_lock_irq(&mddev->write_lock);
2408                         if (atomic_read(&mddev->writes_pending) == 0) {
2409                                 mddev->in_sync = 1;
2410                                 mddev->sb_dirty = 1;
2411                         }
2412                         spin_unlock_irq(&mddev->write_lock);
2413                 } else {
2414                         mddev->ro = 0;
2415                         mddev->recovery_cp = MaxSector;
2416                         err = do_md_run(mddev);
2417                 }
2418                 break;
2419         case active:
2420                 if (mddev->pers) {
2421                         restart_array(mddev);
2422                         mddev->sb_dirty = 0;
2423                         wake_up(&mddev->sb_wait);
2424                         err = 0;
2425                 } else {
2426                         mddev->ro = 0;
2427                         err = do_md_run(mddev);
2428                 }
2429                 break;
2430         case write_pending:
2431         case active_idle:
2432                 /* these cannot be set */
2433                 break;
2434         }
2435         if (err)
2436                 return err;
2437         else
2438                 return len;
2439 }
2440 static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, array_state_store);
2441
2442 static ssize_t
2443 null_show(mddev_t *mddev, char *page)
2444 {
2445         return -EINVAL;
2446 }
2447
2448 static ssize_t
2449 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2450 {
2451         /* buf must be %d:%d\n? giving major and minor numbers */
2452         /* The new device is added to the array.
2453          * If the array has a persistent superblock, we read the
2454          * superblock to initialise info and check validity.
2455          * Otherwise, only checking done is that in bind_rdev_to_array,
2456          * which mainly checks size.
2457          */
2458         char *e;
2459         int major = simple_strtoul(buf, &e, 10);
2460         int minor;
2461         dev_t dev;
2462         mdk_rdev_t *rdev;
2463         int err;
2464
2465         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2466                 return -EINVAL;
2467         minor = simple_strtoul(e+1, &e, 10);
2468         if (*e && *e != '\n')
2469                 return -EINVAL;
2470         dev = MKDEV(major, minor);
2471         if (major != MAJOR(dev) ||
2472             minor != MINOR(dev))
2473                 return -EOVERFLOW;
2474
2475
2476         if (mddev->persistent) {
2477                 rdev = md_import_device(dev, mddev->major_version,
2478                                         mddev->minor_version);
2479                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2480                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2481                                                        mdk_rdev_t, same_set);
2482                         err = super_types[mddev->major_version]
2483                                 .load_super(rdev, rdev0, mddev->minor_version);
2484                         if (err < 0)
2485                                 goto out;
2486                 }
2487         } else
2488                 rdev = md_import_device(dev, -1, -1);
2489
2490         if (IS_ERR(rdev))
2491                 return PTR_ERR(rdev);
2492         err = bind_rdev_to_array(rdev, mddev);
2493  out:
2494         if (err)
2495                 export_rdev(rdev);
2496         return err ? err : len;
2497 }
2498
2499 static struct md_sysfs_entry md_new_device =
2500 __ATTR(new_dev, 0200, null_show, new_dev_store);
2501
2502 static ssize_t
2503 size_show(mddev_t *mddev, char *page)
2504 {
2505         return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2506 }
2507
2508 static int update_size(mddev_t *mddev, unsigned long size);
2509
2510 static ssize_t
2511 size_store(mddev_t *mddev, const char *buf, size_t len)
2512 {
2513         /* If array is inactive, we can reduce the component size, but
2514          * not increase it (except from 0).
2515          * If array is active, we can try an on-line resize
2516          */
2517         char *e;
2518         int err = 0;
2519         unsigned long long size = simple_strtoull(buf, &e, 10);
2520         if (!*buf || *buf == '\n' ||
2521             (*e && *e != '\n'))
2522                 return -EINVAL;
2523
2524         if (mddev->pers) {
2525                 err = update_size(mddev, size);
2526                 md_update_sb(mddev);
2527         } else {
2528                 if (mddev->size == 0 ||
2529                     mddev->size > size)
2530                         mddev->size = size;
2531                 else
2532                         err = -ENOSPC;
2533         }
2534         return err ? err : len;
2535 }
2536
2537 static struct md_sysfs_entry md_size =
2538 __ATTR(component_size, 0644, size_show, size_store);
2539
2540
2541 /* Metdata version.
2542  * This is either 'none' for arrays with externally managed metadata,
2543  * or N.M for internally known formats
2544  */
2545 static ssize_t
2546 metadata_show(mddev_t *mddev, char *page)
2547 {
2548         if (mddev->persistent)
2549                 return sprintf(page, "%d.%d\n",
2550                                mddev->major_version, mddev->minor_version);
2551         else
2552                 return sprintf(page, "none\n");
2553 }
2554
2555 static ssize_t
2556 metadata_store(mddev_t *mddev, const char *buf, size_t len)
2557 {
2558         int major, minor;
2559         char *e;
2560         if (!list_empty(&mddev->disks))
2561                 return -EBUSY;
2562
2563         if (cmd_match(buf, "none")) {
2564                 mddev->persistent = 0;
2565                 mddev->major_version = 0;
2566                 mddev->minor_version = 90;
2567                 return len;
2568         }
2569         major = simple_strtoul(buf, &e, 10);
2570         if (e==buf || *e != '.')
2571                 return -EINVAL;
2572         buf = e+1;
2573         minor = simple_strtoul(buf, &e, 10);
2574         if (e==buf || *e != '\n')
2575                 return -EINVAL;
2576         if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2577             super_types[major].name == NULL)
2578                 return -ENOENT;
2579         mddev->major_version = major;
2580         mddev->minor_version = minor;
2581         mddev->persistent = 1;
2582         return len;
2583 }
2584
2585 static struct md_sysfs_entry md_metadata =
2586 __ATTR(metadata_version, 0644, metadata_show, metadata_store);
2587
2588 static ssize_t
2589 action_show(mddev_t *mddev, char *page)
2590 {
2591         char *type = "idle";
2592         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2593             test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
2594                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2595                         type = "reshape";
2596                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2597                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2598                                 type = "resync";
2599                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2600                                 type = "check";
2601                         else
2602                                 type = "repair";
2603                 } else
2604                         type = "recover";
2605         }
2606         return sprintf(page, "%s\n", type);
2607 }
2608
2609 static ssize_t
2610 action_store(mddev_t *mddev, const char *page, size_t len)
2611 {
2612         if (!mddev->pers || !mddev->pers->sync_request)
2613                 return -EINVAL;
2614
2615         if (cmd_match(page, "idle")) {
2616                 if (mddev->sync_thread) {
2617                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2618                         md_unregister_thread(mddev->sync_thread);
2619                         mddev->sync_thread = NULL;
2620                         mddev->recovery = 0;
2621                 }
2622         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2623                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2624                 return -EBUSY;
2625         else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2626                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2627         else if (cmd_match(page, "reshape")) {
2628                 int err;
2629                 if (mddev->pers->start_reshape == NULL)
2630                         return -EINVAL;
2631                 err = mddev->pers->start_reshape(mddev);
2632                 if (err)
2633                         return err;
2634         } else {
2635                 if (cmd_match(page, "check"))
2636                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2637                 else if (!cmd_match(page, "repair"))
2638                         return -EINVAL;
2639                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
2640                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
2641         }
2642         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2643         md_wakeup_thread(mddev->thread);
2644         return len;
2645 }
2646
2647 static ssize_t
2648 mismatch_cnt_show(mddev_t *mddev, char *page)
2649 {
2650         return sprintf(page, "%llu\n",
2651                        (unsigned long long) mddev->resync_mismatches);
2652 }
2653
2654 static struct md_sysfs_entry
2655 md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
2656
2657
2658 static struct md_sysfs_entry
2659 md_mismatches = __ATTR_RO(mismatch_cnt);
2660
2661 static ssize_t
2662 sync_min_show(mddev_t *mddev, char *page)
2663 {
2664         return sprintf(page, "%d (%s)\n", speed_min(mddev),
2665                        mddev->sync_speed_min ? "local": "system");
2666 }
2667
2668 static ssize_t
2669 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2670 {
2671         int min;
2672         char *e;
2673         if (strncmp(buf, "system", 6)==0) {
2674                 mddev->sync_speed_min = 0;
2675                 return len;
2676         }
2677         min = simple_strtoul(buf, &e, 10);
2678         if (buf == e || (*e && *e != '\n') || min <= 0)
2679                 return -EINVAL;
2680         mddev->sync_speed_min = min;
2681         return len;
2682 }
2683
2684 static struct md_sysfs_entry md_sync_min =
2685 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2686
2687 static ssize_t
2688 sync_max_show(mddev_t *mddev, char *page)
2689 {
2690         return sprintf(page, "%d (%s)\n", speed_max(mddev),
2691                        mddev->sync_speed_max ? "local": "system");
2692 }
2693
2694 static ssize_t
2695 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2696 {
2697         int max;
2698         char *e;
2699         if (strncmp(buf, "system", 6)==0) {
2700                 mddev->sync_speed_max = 0;
2701                 return len;
2702         }
2703         max = simple_strtoul(buf, &e, 10);
2704         if (buf == e || (*e && *e != '\n') || max <= 0)
2705                 return -EINVAL;
2706         mddev->sync_speed_max = max;
2707         return len;
2708 }
2709
2710 static struct md_sysfs_entry md_sync_max =
2711 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2712
2713
2714 static ssize_t
2715 sync_speed_show(mddev_t *mddev, char *page)
2716 {
2717         unsigned long resync, dt, db;
2718         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2719         dt = ((jiffies - mddev->resync_mark) / HZ);
2720         if (!dt) dt++;
2721         db = resync - (mddev->resync_mark_cnt);
2722         return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2723 }
2724
2725 static struct md_sysfs_entry
2726 md_sync_speed = __ATTR_RO(sync_speed);
2727
2728 static ssize_t
2729 sync_completed_show(mddev_t *mddev, char *page)
2730 {
2731         unsigned long max_blocks, resync;
2732
2733         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2734                 max_blocks = mddev->resync_max_sectors;
2735         else
2736                 max_blocks = mddev->size << 1;
2737
2738         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2739         return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2740 }
2741
2742 static struct md_sysfs_entry
2743 md_sync_completed = __ATTR_RO(sync_completed);
2744
2745 static ssize_t
2746 suspend_lo_show(mddev_t *mddev, char *page)
2747 {
2748         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2749 }
2750
2751 static ssize_t
2752 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2753 {
2754         char *e;
2755         unsigned long long new = simple_strtoull(buf, &e, 10);
2756
2757         if (mddev->pers->quiesce == NULL)
2758                 return -EINVAL;
2759         if (buf == e || (*e && *e != '\n'))
2760                 return -EINVAL;
2761         if (new >= mddev->suspend_hi ||
2762             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2763                 mddev->suspend_lo = new;
2764                 mddev->pers->quiesce(mddev, 2);
2765                 return len;
2766         } else
2767                 return -EINVAL;
2768 }
2769 static struct md_sysfs_entry md_suspend_lo =
2770 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2771
2772
2773 static ssize_t
2774 suspend_hi_show(mddev_t *mddev, char *page)
2775 {
2776         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2777 }
2778
2779 static ssize_t
2780 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2781 {
2782         char *e;
2783         unsigned long long new = simple_strtoull(buf, &e, 10);
2784
2785         if (mddev->pers->quiesce == NULL)
2786                 return -EINVAL;
2787         if (buf == e || (*e && *e != '\n'))
2788                 return -EINVAL;
2789         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2790             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2791                 mddev->suspend_hi = new;
2792                 mddev->pers->quiesce(mddev, 1);
2793                 mddev->pers->quiesce(mddev, 0);
2794                 return len;
2795         } else
2796                 return -EINVAL;
2797 }
2798 static struct md_sysfs_entry md_suspend_hi =
2799 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2800
2801
2802 static struct attribute *md_default_attrs[] = {
2803         &md_level.attr,
2804         &md_layout.attr,
2805         &md_raid_disks.attr,
2806         &md_chunk_size.attr,
2807         &md_size.attr,
2808         &md_resync_start.attr,
2809         &md_metadata.attr,
2810         &md_new_device.attr,
2811         &md_safe_delay.attr,
2812         &md_array_state.attr,
2813         NULL,
2814 };
2815
2816 static struct attribute *md_redundancy_attrs[] = {
2817         &md_scan_mode.attr,
2818         &md_mismatches.attr,
2819         &md_sync_min.attr,
2820         &md_sync_max.attr,
2821         &md_sync_speed.attr,
2822         &md_sync_completed.attr,
2823         &md_suspend_lo.attr,
2824         &md_suspend_hi.attr,
2825         NULL,
2826 };
2827 static struct attribute_group md_redundancy_group = {
2828         .name = NULL,
2829         .attrs = md_redundancy_attrs,
2830 };
2831
2832
2833 static ssize_t
2834 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2835 {
2836         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2837         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2838         ssize_t rv;
2839
2840         if (!entry->show)
2841                 return -EIO;
2842         rv = mddev_lock(mddev);
2843         if (!rv) {
2844                 rv = entry->show(mddev, page);
2845                 mddev_unlock(mddev);
2846         }
2847         return rv;
2848 }
2849
2850 static ssize_t
2851 md_attr_store(struct kobject *kobj, struct attribute *attr,
2852               const char *page, size_t length)
2853 {
2854         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
2855         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
2856         ssize_t rv;
2857
2858         if (!entry->store)
2859                 return -EIO;
2860         rv = mddev_lock(mddev);
2861         if (!rv) {
2862                 rv = entry->store(mddev, page, length);
2863                 mddev_unlock(mddev);
2864         }
2865         return rv;
2866 }
2867
2868 static void md_free(struct kobject *ko)
2869 {
2870         mddev_t *mddev = container_of(ko, mddev_t, kobj);
2871         kfree(mddev);
2872 }
2873
2874 static struct sysfs_ops md_sysfs_ops = {
2875         .show   = md_attr_show,
2876         .store  = md_attr_store,
2877 };
2878 static struct kobj_type md_ktype = {
2879         .release        = md_free,
2880         .sysfs_ops      = &md_sysfs_ops,
2881         .default_attrs  = md_default_attrs,
2882 };
2883
2884 int mdp_major = 0;
2885
2886 static struct kobject *md_probe(dev_t dev, int *part, void *data)
2887 {
2888         static DEFINE_MUTEX(disks_mutex);
2889         mddev_t *mddev = mddev_find(dev);
2890         struct gendisk *disk;
2891         int partitioned = (MAJOR(dev) != MD_MAJOR);
2892         int shift = partitioned ? MdpMinorShift : 0;
2893         int unit = MINOR(dev) >> shift;
2894
2895         if (!mddev)
2896                 return NULL;
2897
2898         mutex_lock(&disks_mutex);
2899         if (mddev->gendisk) {
2900                 mutex_unlock(&disks_mutex);
2901                 mddev_put(mddev);
2902                 return NULL;
2903         }
2904         disk = alloc_disk(1 << shift);
2905         if (!disk) {
2906                 mutex_unlock(&disks_mutex);
2907                 mddev_put(mddev);
2908                 return NULL;
2909         }
2910         disk->major = MAJOR(dev);
2911         disk->first_minor = unit << shift;
2912         if (partitioned)
2913                 sprintf(disk->disk_name, "md_d%d", unit);
2914         else
2915                 sprintf(disk->disk_name, "md%d", unit);
2916         disk->fops = &md_fops;
2917         disk->private_data = mddev;
2918         disk->queue = mddev->queue;
2919         add_disk(disk);
2920         mddev->gendisk = disk;
2921         mutex_unlock(&disks_mutex);
2922         mddev->kobj.parent = &disk->kobj;
2923         mddev->kobj.k_name = NULL;
2924         snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
2925         mddev->kobj.ktype = &md_ktype;
2926         kobject_register(&mddev->kobj);
2927         return NULL;
2928 }
2929
2930 static void md_safemode_timeout(unsigned long data)
2931 {
2932         mddev_t *mddev = (mddev_t *) data;
2933
2934         mddev->safemode = 1;
2935         md_wakeup_thread(mddev->thread);
2936 }
2937
2938 static int start_dirty_degraded;
2939
2940 static int do_md_run(mddev_t * mddev)
2941 {
2942         int err;
2943         int chunk_size;
2944         struct list_head *tmp;
2945         mdk_rdev_t *rdev;
2946         struct gendisk *disk;
2947         struct mdk_personality *pers;
2948         char b[BDEVNAME_SIZE];
2949
2950         if (list_empty(&mddev->disks))
2951                 /* cannot run an array with no devices.. */
2952                 return -EINVAL;
2953
2954         if (mddev->pers)
2955                 return -EBUSY;
2956
2957         /*
2958          * Analyze all RAID superblock(s)
2959          */
2960         if (!mddev->raid_disks)
2961                 analyze_sbs(mddev);
2962
2963         chunk_size = mddev->chunk_size;
2964
2965         if (chunk_size) {
2966                 if (chunk_size > MAX_CHUNK_SIZE) {
2967                         printk(KERN_ERR "too big chunk_size: %d > %d\n",
2968                                 chunk_size, MAX_CHUNK_SIZE);
2969                         return -EINVAL;
2970                 }
2971                 /*
2972                  * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
2973                  */
2974                 if ( (1 << ffz(~chunk_size)) != chunk_size) {
2975                         printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
2976                         return -EINVAL;
2977                 }
2978                 if (chunk_size < PAGE_SIZE) {
2979                         printk(KERN_ERR "too small chunk_size: %d < %ld\n",
2980                                 chunk_size, PAGE_SIZE);
2981                         return -EINVAL;
2982                 }
2983
2984                 /* devices must have minimum size of one chunk */
2985                 ITERATE_RDEV(mddev,rdev,tmp) {
2986                         if (test_bit(Faulty, &rdev->flags))
2987                                 continue;
2988                         if (rdev->size < chunk_size / 1024) {
2989                                 printk(KERN_WARNING
2990                                         "md: Dev %s smaller than chunk_size:"
2991                                         " %lluk < %dk\n",
2992                                         bdevname(rdev->bdev,b),
2993                                         (unsigned long long)rdev->size,
2994                                         chunk_size / 1024);
2995                                 return -EINVAL;
2996                         }
2997                 }
2998         }
2999
3000 #ifdef CONFIG_KMOD
3001         if (mddev->level != LEVEL_NONE)
3002                 request_module("md-level-%d", mddev->level);
3003         else if (mddev->clevel[0])
3004                 request_module("md-%s", mddev->clevel);
3005 #endif
3006
3007         /*
3008          * Drop all container device buffers, from now on
3009          * the only valid external interface is through the md
3010          * device.
3011          * Also find largest hardsector size
3012          */
3013         ITERATE_RDEV(mddev,rdev,tmp) {
3014                 if (test_bit(Faulty, &rdev->flags))
3015                         continue;
3016                 sync_blockdev(rdev->bdev);
3017                 invalidate_bdev(rdev->bdev, 0);
3018         }
3019
3020         md_probe(mddev->unit, NULL, NULL);
3021         disk = mddev->gendisk;
3022         if (!disk)
3023                 return -ENOMEM;
3024
3025         spin_lock(&pers_lock);
3026         pers = find_pers(mddev->level, mddev->clevel);
3027         if (!pers || !try_module_get(pers->owner)) {
3028                 spin_unlock(&pers_lock);
3029                 if (mddev->level != LEVEL_NONE)
3030                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3031                                mddev->level);
3032                 else
3033                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3034                                mddev->clevel);
3035                 return -EINVAL;
3036         }
3037         mddev->pers = pers;
3038         spin_unlock(&pers_lock);
3039         mddev->level = pers->level;
3040         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3041
3042         if (mddev->reshape_position != MaxSector &&
3043             pers->start_reshape == NULL) {
3044                 /* This personality cannot handle reshaping... */
3045                 mddev->pers = NULL;
3046                 module_put(pers->owner);
3047                 return -EINVAL;
3048         }
3049
3050         mddev->recovery = 0;
3051         mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3052         mddev->barriers_work = 1;
3053         mddev->ok_start_degraded = start_dirty_degraded;
3054
3055         if (start_readonly)
3056                 mddev->ro = 2; /* read-only, but switch on first write */
3057
3058         err = mddev->pers->run(mddev);
3059         if (!err && mddev->pers->sync_request) {
3060                 err = bitmap_create(mddev);
3061                 if (err) {
3062                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3063                                mdname(mddev), err);
3064                         mddev->pers->stop(mddev);
3065                 }
3066         }
3067         if (err) {
3068                 printk(KERN_ERR "md: pers->run() failed ...\n");
3069                 module_put(mddev->pers->owner);
3070                 mddev->pers = NULL;
3071                 bitmap_destroy(mddev);
3072                 return err;
3073         }
3074         if (mddev->pers->sync_request)
3075                 sysfs_create_group(&mddev->kobj, &md_redundancy_group);
3076         else if (mddev->ro == 2) /* auto-readonly not meaningful */
3077                 mddev->ro = 0;
3078
3079         atomic_set(&mddev->writes_pending,0);
3080         mddev->safemode = 0;
3081         mddev->safemode_timer.function = md_safemode_timeout;
3082         mddev->safemode_timer.data = (unsigned long) mddev;
3083         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3084         mddev->in_sync = 1;
3085
3086         ITERATE_RDEV(mddev,rdev,tmp)
3087                 if (rdev->raid_disk >= 0) {
3088                         char nm[20];
3089                         sprintf(nm, "rd%d", rdev->raid_disk);
3090                         sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
3091                 }
3092         
3093         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3094         md_wakeup_thread(mddev->thread);
3095         
3096         if (mddev->sb_dirty)
3097                 md_update_sb(mddev);
3098
3099         set_capacity(disk, mddev->array_size<<1);
3100
3101         /* If we call blk_queue_make_request here, it will
3102          * re-initialise max_sectors etc which may have been
3103          * refined inside -> run.  So just set the bits we need to set.
3104          * Most initialisation happended when we called
3105          * blk_queue_make_request(..., md_fail_request)
3106          * earlier.
3107          */
3108         mddev->queue->queuedata = mddev;
3109         mddev->queue->make_request_fn = mddev->pers->make_request;
3110
3111         /* If there is a partially-recovered drive we need to
3112          * start recovery here.  If we leave it to md_check_recovery,
3113          * it will remove the drives and not do the right thing
3114          */
3115         if (mddev->degraded) {
3116                 struct list_head *rtmp;
3117                 int spares = 0;
3118                 ITERATE_RDEV(mddev,rdev,rtmp)
3119                         if (rdev->raid_disk >= 0 &&
3120                             !test_bit(In_sync, &rdev->flags) &&
3121                             !test_bit(Faulty, &rdev->flags))
3122                                 /* complete an interrupted recovery */
3123                                 spares++;
3124                 if (spares && mddev->pers->sync_request) {
3125                         mddev->recovery = 0;
3126                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3127                         mddev->sync_thread = md_register_thread(md_do_sync,
3128                                                                 mddev,
3129                                                                 "%s_resync");
3130                         if (!mddev->sync_thread) {
3131                                 printk(KERN_ERR "%s: could not start resync"
3132                                        " thread...\n",
3133                                        mdname(mddev));
3134                                 /* leave the spares where they are, it shouldn't hurt */
3135                                 mddev->recovery = 0;
3136                         } else
3137                                 md_wakeup_thread(mddev->sync_thread);
3138                 }
3139         }
3140
3141         mddev->changed = 1;
3142         md_new_event(mddev);
3143         return 0;
3144 }
3145
3146 static int restart_array(mddev_t *mddev)
3147 {
3148         struct gendisk *disk = mddev->gendisk;
3149         int err;
3150
3151         /*
3152          * Complain if it has no devices
3153          */
3154         err = -ENXIO;
3155         if (list_empty(&mddev->disks))
3156                 goto out;
3157
3158         if (mddev->pers) {
3159                 err = -EBUSY;
3160                 if (!mddev->ro)
3161                         goto out;
3162
3163                 mddev->safemode = 0;
3164                 mddev->ro = 0;
3165                 set_disk_ro(disk, 0);
3166
3167                 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3168                         mdname(mddev));
3169                 /*
3170                  * Kick recovery or resync if necessary
3171                  */
3172                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3173                 md_wakeup_thread(mddev->thread);
3174                 md_wakeup_thread(mddev->sync_thread);
3175                 err = 0;
3176         } else
3177                 err = -EINVAL;
3178
3179 out:
3180         return err;
3181 }
3182
3183 /* similar to deny_write_access, but accounts for our holding a reference
3184  * to the file ourselves */
3185 static int deny_bitmap_write_access(struct file * file)
3186 {
3187         struct inode *inode = file->f_mapping->host;
3188
3189         spin_lock(&inode->i_lock);
3190         if (atomic_read(&inode->i_writecount) > 1) {
3191                 spin_unlock(&inode->i_lock);
3192                 return -ETXTBSY;
3193         }
3194         atomic_set(&inode->i_writecount, -1);
3195         spin_unlock(&inode->i_lock);
3196
3197         return 0;
3198 }
3199
3200 static void restore_bitmap_write_access(struct file *file)
3201 {
3202         struct inode *inode = file->f_mapping->host;
3203
3204         spin_lock(&inode->i_lock);
3205         atomic_set(&inode->i_writecount, 1);
3206         spin_unlock(&inode->i_lock);
3207 }
3208
3209 /* mode:
3210  *   0 - completely stop and dis-assemble array
3211  *   1 - switch to readonly
3212  *   2 - stop but do not disassemble array
3213  */
3214 static int do_md_stop(mddev_t * mddev, int mode)
3215 {
3216         int err = 0;
3217         struct gendisk *disk = mddev->gendisk;
3218
3219         if (mddev->pers) {
3220                 if (atomic_read(&mddev->active)>2) {
3221                         printk("md: %s still in use.\n",mdname(mddev));
3222                         return -EBUSY;
3223                 }
3224
3225                 if (mddev->sync_thread) {
3226                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3227                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3228                         md_unregister_thread(mddev->sync_thread);
3229                         mddev->sync_thread = NULL;
3230                 }
3231
3232                 del_timer_sync(&mddev->safemode_timer);
3233
3234                 invalidate_partition(disk, 0);
3235
3236                 switch(mode) {
3237                 case 1: /* readonly */
3238                         err  = -ENXIO;
3239                         if (mddev->ro==1)
3240                                 goto out;
3241                         mddev->ro = 1;
3242                         break;
3243                 case 0: /* disassemble */
3244                 case 2: /* stop */
3245                         bitmap_flush(mddev);
3246                         md_super_wait(mddev);
3247                         if (mddev->ro)
3248                                 set_disk_ro(disk, 0);
3249                         blk_queue_make_request(mddev->queue, md_fail_request);
3250                         mddev->pers->stop(mddev);
3251                         if (mddev->pers->sync_request)
3252                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3253
3254                         module_put(mddev->pers->owner);
3255                         mddev->pers = NULL;
3256                         if (mddev->ro)
3257                                 mddev->ro = 0;
3258                 }
3259                 if (!mddev->in_sync || mddev->sb_dirty) {
3260                         /* mark array as shutdown cleanly */
3261                         mddev->in_sync = 1;
3262                         md_update_sb(mddev);
3263                 }
3264                 if (mode == 1)
3265                         set_disk_ro(disk, 1);
3266                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3267         }
3268
3269         /*
3270          * Free resources if final stop
3271          */
3272         if (mode == 0) {
3273                 mdk_rdev_t *rdev;
3274                 struct list_head *tmp;
3275                 struct gendisk *disk;
3276                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3277
3278                 bitmap_destroy(mddev);
3279                 if (mddev->bitmap_file) {
3280                         restore_bitmap_write_access(mddev->bitmap_file);
3281                         fput(mddev->bitmap_file);
3282                         mddev->bitmap_file = NULL;
3283                 }
3284                 mddev->bitmap_offset = 0;
3285
3286                 ITERATE_RDEV(mddev,rdev,tmp)
3287                         if (rdev->raid_disk >= 0) {
3288                                 char nm[20];
3289                                 sprintf(nm, "rd%d", rdev->raid_disk);
3290                                 sysfs_remove_link(&mddev->kobj, nm);
3291                         }
3292
3293                 export_array(mddev);
3294
3295                 mddev->array_size = 0;
3296                 mddev->size = 0;
3297                 mddev->raid_disks = 0;
3298                 mddev->recovery_cp = 0;
3299
3300                 disk = mddev->gendisk;
3301                 if (disk)
3302                         set_capacity(disk, 0);
3303                 mddev->changed = 1;
3304         } else if (mddev->pers)
3305                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
3306                         mdname(mddev));
3307         err = 0;
3308         md_new_event(mddev);
3309 out:
3310         return err;
3311 }
3312
3313 static void autorun_array(mddev_t *mddev)
3314 {
3315         mdk_rdev_t *rdev;
3316         struct list_head *tmp;
3317         int err;
3318
3319         if (list_empty(&mddev->disks))
3320                 return;
3321
3322         printk(KERN_INFO "md: running: ");
3323
3324         ITERATE_RDEV(mddev,rdev,tmp) {
3325                 char b[BDEVNAME_SIZE];
3326                 printk("<%s>", bdevname(rdev->bdev,b));
3327         }
3328         printk("\n");
3329
3330         err = do_md_run (mddev);
3331         if (err) {
3332                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3333                 do_md_stop (mddev, 0);
3334         }
3335 }
3336
3337 /*
3338  * lets try to run arrays based on all disks that have arrived
3339  * until now. (those are in pending_raid_disks)
3340  *
3341  * the method: pick the first pending disk, collect all disks with
3342  * the same UUID, remove all from the pending list and put them into
3343  * the 'same_array' list. Then order this list based on superblock
3344  * update time (freshest comes first), kick out 'old' disks and
3345  * compare superblocks. If everything's fine then run it.
3346  *
3347  * If "unit" is allocated, then bump its reference count
3348  */
3349 static void autorun_devices(int part)
3350 {
3351         struct list_head *tmp;
3352         mdk_rdev_t *rdev0, *rdev;
3353         mddev_t *mddev;
3354         char b[BDEVNAME_SIZE];
3355
3356         printk(KERN_INFO "md: autorun ...\n");
3357         while (!list_empty(&pending_raid_disks)) {
3358                 dev_t dev;
3359                 LIST_HEAD(candidates);
3360                 rdev0 = list_entry(pending_raid_disks.next,
3361                                          mdk_rdev_t, same_set);
3362
3363                 printk(KERN_INFO "md: considering %s ...\n",
3364                         bdevname(rdev0->bdev,b));
3365                 INIT_LIST_HEAD(&candidates);
3366                 ITERATE_RDEV_PENDING(rdev,tmp)
3367                         if (super_90_load(rdev, rdev0, 0) >= 0) {
3368                                 printk(KERN_INFO "md:  adding %s ...\n",
3369                                         bdevname(rdev->bdev,b));
3370                                 list_move(&rdev->same_set, &candidates);
3371                         }
3372                 /*
3373                  * now we have a set of devices, with all of them having
3374                  * mostly sane superblocks. It's time to allocate the
3375                  * mddev.
3376                  */
3377                 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
3378                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
3379                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
3380                         break;
3381                 }
3382                 if (part)
3383                         dev = MKDEV(mdp_major,
3384                                     rdev0->preferred_minor << MdpMinorShift);
3385                 else
3386                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
3387
3388                 md_probe(dev, NULL, NULL);
3389                 mddev = mddev_find(dev);
3390                 if (!mddev) {
3391                         printk(KERN_ERR 
3392                                 "md: cannot allocate memory for md drive.\n");
3393                         break;
3394                 }
3395                 if (mddev_lock(mddev)) 
3396                         printk(KERN_WARNING "md: %s locked, cannot run\n",
3397                                mdname(mddev));
3398                 else if (mddev->raid_disks || mddev->major_version
3399                          || !list_empty(&mddev->disks)) {
3400                         printk(KERN_WARNING 
3401                                 "md: %s already running, cannot run %s\n",
3402                                 mdname(mddev), bdevname(rdev0->bdev,b));
3403                         mddev_unlock(mddev);
3404                 } else {
3405                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
3406                         ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
3407                                 list_del_init(&rdev->same_set);
3408                                 if (bind_rdev_to_array(rdev, mddev))
3409                                         export_rdev(rdev);
3410                         }
3411                         autorun_array(mddev);
3412                         mddev_unlock(mddev);
3413                 }
3414                 /* on success, candidates will be empty, on error
3415                  * it won't...
3416                  */
3417                 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
3418                         export_rdev(rdev);
3419                 mddev_put(mddev);
3420         }
3421         printk(KERN_INFO "md: ... autorun DONE.\n");
3422 }
3423
3424 /*
3425  * import RAID devices based on one partition
3426  * if possible, the array gets run as well.
3427  */
3428
3429 static int autostart_array(dev_t startdev)
3430 {
3431         char b[BDEVNAME_SIZE];
3432         int err = -EINVAL, i;
3433         mdp_super_t *sb = NULL;
3434         mdk_rdev_t *start_rdev = NULL, *rdev;
3435
3436         start_rdev = md_import_device(startdev, 0, 0);
3437         if (IS_ERR(start_rdev))
3438                 return err;
3439
3440
3441         /* NOTE: this can only work for 0.90.0 superblocks */
3442         sb = (mdp_super_t*)page_address(start_rdev->sb_page);
3443         if (sb->major_version != 0 ||
3444             sb->minor_version != 90 ) {
3445                 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
3446                 export_rdev(start_rdev);
3447                 return err;
3448         }
3449
3450         if (test_bit(Faulty, &start_rdev->flags)) {
3451                 printk(KERN_WARNING 
3452                         "md: can not autostart based on faulty %s!\n",
3453                         bdevname(start_rdev->bdev,b));
3454                 export_rdev(start_rdev);
3455                 return err;
3456         }
3457         list_add(&start_rdev->same_set, &pending_raid_disks);
3458
3459         for (i = 0; i < MD_SB_DISKS; i++) {
3460                 mdp_disk_t *desc = sb->disks + i;
3461                 dev_t dev = MKDEV(desc->major, desc->minor);
3462
3463                 if (!dev)
3464                         continue;
3465                 if (dev == startdev)
3466                         continue;
3467                 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
3468                         continue;
3469                 rdev = md_import_device(dev, 0, 0);
3470                 if (IS_ERR(rdev))
3471                         continue;
3472
3473                 list_add(&rdev->same_set, &pending_raid_disks);
3474         }
3475
3476         /*
3477          * possibly return codes
3478          */
3479         autorun_devices(0);
3480         return 0;
3481
3482 }
3483
3484
3485 static int get_version(void __user * arg)
3486 {
3487         mdu_version_t ver;
3488
3489         ver.major = MD_MAJOR_VERSION;
3490         ver.minor = MD_MINOR_VERSION;
3491         ver.patchlevel = MD_PATCHLEVEL_VERSION;
3492
3493         if (copy_to_user(arg, &ver, sizeof(ver)))
3494                 return -EFAULT;
3495
3496         return 0;
3497 }
3498
3499 static int get_array_info(mddev_t * mddev, void __user * arg)
3500 {
3501         mdu_array_info_t info;
3502         int nr,working,active,failed,spare;
3503         mdk_rdev_t *rdev;
3504         struct list_head *tmp;
3505
3506         nr=working=active=failed=spare=0;
3507         ITERATE_RDEV(mddev,rdev,tmp) {
3508                 nr++;
3509                 if (test_bit(Faulty, &rdev->flags))
3510                         failed++;
3511                 else {
3512                         working++;
3513                         if (test_bit(In_sync, &rdev->flags))
3514                                 active++;       
3515                         else
3516                                 spare++;
3517                 }
3518         }
3519
3520         info.major_version = mddev->major_version;
3521         info.minor_version = mddev->minor_version;
3522         info.patch_version = MD_PATCHLEVEL_VERSION;
3523         info.ctime         = mddev->ctime;
3524         info.level         = mddev->level;
3525         info.size          = mddev->size;
3526         if (info.size != mddev->size) /* overflow */
3527                 info.size = -1;
3528         info.nr_disks      = nr;
3529         info.raid_disks    = mddev->raid_disks;
3530         info.md_minor      = mddev->md_minor;
3531         info.not_persistent= !mddev->persistent;
3532
3533         info.utime         = mddev->utime;
3534         info.state         = 0;
3535         if (mddev->in_sync)
3536                 info.state = (1<<MD_SB_CLEAN);
3537         if (mddev->bitmap && mddev->bitmap_offset)
3538                 info.state = (1<<MD_SB_BITMAP_PRESENT);
3539         info.active_disks  = active;
3540         info.working_disks = working;
3541         info.failed_disks  = failed;
3542         info.spare_disks   = spare;
3543
3544         info.layout        = mddev->layout;
3545         info.chunk_size    = mddev->chunk_size;
3546
3547         if (copy_to_user(arg, &info, sizeof(info)))
3548                 return -EFAULT;
3549
3550         return 0;
3551 }
3552
3553 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
3554 {
3555         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
3556         char *ptr, *buf = NULL;
3557         int err = -ENOMEM;
3558
3559         file = kmalloc(sizeof(*file), GFP_KERNEL);
3560         if (!file)
3561                 goto out;
3562
3563         /* bitmap disabled, zero the first byte and copy out */
3564         if (!mddev->bitmap || !mddev->bitmap->file) {
3565                 file->pathname[0] = '\0';
3566                 goto copy_out;
3567         }
3568
3569         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
3570         if (!buf)
3571                 goto out;
3572
3573         ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
3574         if (!ptr)
3575                 goto out;
3576
3577         strcpy(file->pathname, ptr);
3578
3579 copy_out:
3580         err = 0;
3581         if (copy_to_user(arg, file, sizeof(*file)))
3582                 err = -EFAULT;
3583 out:
3584         kfree(buf);
3585         kfree(file);
3586         return err;
3587 }
3588
3589 static int get_disk_info(mddev_t * mddev, void __user * arg)
3590 {
3591         mdu_disk_info_t info;
3592         unsigned int nr;
3593         mdk_rdev_t *rdev;
3594
3595         if (copy_from_user(&info, arg, sizeof(info)))
3596                 return -EFAULT;
3597
3598         nr = info.number;
3599
3600         rdev = find_rdev_nr(mddev, nr);
3601         if (rdev) {
3602                 info.major = MAJOR(rdev->bdev->bd_dev);
3603                 info.minor = MINOR(rdev->bdev->bd_dev);
3604                 info.raid_disk = rdev->raid_disk;
3605                 info.state = 0;
3606                 if (test_bit(Faulty, &rdev->flags))
3607                         info.state |= (1<<MD_DISK_FAULTY);
3608                 else if (test_bit(In_sync, &rdev->flags)) {
3609                         info.state |= (1<<MD_DISK_ACTIVE);
3610                         info.state |= (1<<MD_DISK_SYNC);
3611                 }
3612                 if (test_bit(WriteMostly, &rdev->flags))
3613                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
3614         } else {
3615                 info.major = info.minor = 0;
3616                 info.raid_disk = -1;
3617                 info.state = (1<<MD_DISK_REMOVED);
3618         }
3619
3620         if (copy_to_user(arg, &info, sizeof(info)))
3621                 return -EFAULT;
3622
3623         return 0;
3624 }
3625
3626 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
3627 {
3628         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3629         mdk_rdev_t *rdev;
3630         dev_t dev = MKDEV(info->major,info->minor);
3631
3632         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
3633                 return -EOVERFLOW;
3634
3635         if (!mddev->raid_disks) {
3636                 int err;
3637                 /* expecting a device which has a superblock */
3638                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
3639                 if (IS_ERR(rdev)) {
3640                         printk(KERN_WARNING 
3641                                 "md: md_import_device returned %ld\n",
3642                                 PTR_ERR(rdev));
3643                         return PTR_ERR(rdev);
3644                 }
3645                 if (!list_empty(&mddev->disks)) {
3646                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3647                                                         mdk_rdev_t, same_set);
3648                         int err = super_types[mddev->major_version]
3649                                 .load_super(rdev, rdev0, mddev->minor_version);
3650                         if (err < 0) {
3651                                 printk(KERN_WARNING 
3652                                         "md: %s has different UUID to %s\n",
3653                                         bdevname(rdev->bdev,b), 
3654                                         bdevname(rdev0->bdev,b2));
3655                                 export_rdev(rdev);
3656                                 return -EINVAL;
3657                         }
3658                 }
3659                 err = bind_rdev_to_array(rdev, mddev);
3660                 if (err)
3661                         export_rdev(rdev);
3662                 return err;
3663         }
3664
3665         /*
3666          * add_new_disk can be used once the array is assembled
3667          * to add "hot spares".  They must already have a superblock
3668          * written
3669          */
3670         if (mddev->pers) {
3671                 int err;
3672                 if (!mddev->pers->hot_add_disk) {
3673                         printk(KERN_WARNING 
3674                                 "%s: personality does not support diskops!\n",
3675                                mdname(mddev));
3676                         return -EINVAL;
3677                 }
3678                 if (mddev->persistent)
3679                         rdev = md_import_device(dev, mddev->major_version,
3680                                                 mddev->minor_version);
3681                 else
3682                         rdev = md_import_device(dev, -1, -1);
3683                 if (IS_ERR(rdev)) {
3684                         printk(KERN_WARNING 
3685                                 "md: md_import_device returned %ld\n",
3686                                 PTR_ERR(rdev));
3687                         return PTR_ERR(rdev);
3688                 }
3689                 /* set save_raid_disk if appropriate */
3690                 if (!mddev->persistent) {
3691                         if (info->state & (1<<MD_DISK_SYNC)  &&
3692                             info->raid_disk < mddev->raid_disks)
3693                                 rdev->raid_disk = info->raid_disk;
3694                         else
3695                                 rdev->raid_disk = -1;
3696                 } else
3697                         super_types[mddev->major_version].
3698                                 validate_super(mddev, rdev);
3699                 rdev->saved_raid_disk = rdev->raid_disk;
3700
3701                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
3702                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3703                         set_bit(WriteMostly, &rdev->flags);
3704
3705                 rdev->raid_disk = -1;
3706                 err = bind_rdev_to_array(rdev, mddev);
3707                 if (!err && !mddev->pers->hot_remove_disk) {
3708                         /* If there is hot_add_disk but no hot_remove_disk
3709                          * then added disks for geometry changes,
3710                          * and should be added immediately.
3711                          */
3712                         super_types[mddev->major_version].
3713                                 validate_super(mddev, rdev);
3714                         err = mddev->pers->hot_add_disk(mddev, rdev);
3715                         if (err)
3716                                 unbind_rdev_from_array(rdev);
3717                 }
3718                 if (err)
3719                         export_rdev(rdev);
3720
3721                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3722                 md_wakeup_thread(mddev->thread);
3723                 return err;
3724         }
3725
3726         /* otherwise, add_new_disk is only allowed
3727          * for major_version==0 superblocks
3728          */
3729         if (mddev->major_version != 0) {
3730                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
3731                        mdname(mddev));
3732                 return -EINVAL;
3733         }
3734
3735         if (!(info->state & (1<<MD_DISK_FAULTY))) {
3736                 int err;
3737                 rdev = md_import_device (dev, -1, 0);
3738                 if (IS_ERR(rdev)) {
3739                         printk(KERN_WARNING 
3740                                 "md: error, md_import_device() returned %ld\n",
3741                                 PTR_ERR(rdev));
3742                         return PTR_ERR(rdev);
3743                 }
3744                 rdev->desc_nr = info->number;
3745                 if (info->raid_disk < mddev->raid_disks)
3746                         rdev->raid_disk = info->raid_disk;
3747                 else
3748                         rdev->raid_disk = -1;
3749
3750                 rdev->flags = 0;
3751
3752                 if (rdev->raid_disk < mddev->raid_disks)
3753                         if (info->state & (1<<MD_DISK_SYNC))
3754                                 set_bit(In_sync, &rdev->flags);
3755
3756                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
3757                         set_bit(WriteMostly, &rdev->flags);
3758
3759                 if (!mddev->persistent) {
3760                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
3761                         rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3762                 } else 
3763                         rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3764                 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
3765
3766                 err = bind_rdev_to_array(rdev, mddev);
3767                 if (err) {
3768                         export_rdev(rdev);
3769                         return err;
3770                 }
3771         }
3772
3773         return 0;
3774 }
3775
3776 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
3777 {
3778         char b[BDEVNAME_SIZE];
3779         mdk_rdev_t *rdev;
3780
3781         if (!mddev->pers)
3782                 return -ENODEV;
3783
3784         rdev = find_rdev(mddev, dev);
3785         if (!rdev)
3786                 return -ENXIO;
3787
3788         if (rdev->raid_disk >= 0)
3789                 goto busy;
3790
3791         kick_rdev_from_array(rdev);
3792         md_update_sb(mddev);
3793         md_new_event(mddev);
3794
3795         return 0;
3796 busy:
3797         printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
3798                 bdevname(rdev->bdev,b), mdname(mddev));
3799         return -EBUSY;
3800 }
3801
3802 static int hot_add_disk(mddev_t * mddev, dev_t dev)
3803 {
3804         char b[BDEVNAME_SIZE];
3805         int err;
3806         unsigned int size;
3807         mdk_rdev_t *rdev;
3808
3809         if (!mddev->pers)
3810                 return -ENODEV;
3811
3812         if (mddev->major_version != 0) {
3813                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
3814                         " version-0 superblocks.\n",
3815                         mdname(mddev));
3816                 return -EINVAL;
3817         }
3818         if (!mddev->pers->hot_add_disk) {
3819                 printk(KERN_WARNING 
3820                         "%s: personality does not support diskops!\n",
3821                         mdname(mddev));
3822                 return -EINVAL;
3823         }
3824
3825         rdev = md_import_device (dev, -1, 0);
3826         if (IS_ERR(rdev)) {
3827                 printk(KERN_WARNING 
3828                         "md: error, md_import_device() returned %ld\n",
3829                         PTR_ERR(rdev));
3830                 return -EINVAL;
3831         }
3832
3833         if (mddev->persistent)
3834                 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
3835         else
3836                 rdev->sb_offset =
3837                         rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
3838
3839         size = calc_dev_size(rdev, mddev->chunk_size);
3840         rdev->size = size;
3841
3842         if (test_bit(Faulty, &rdev->flags)) {
3843                 printk(KERN_WARNING 
3844                         "md: can not hot-add faulty %s disk to %s!\n",
3845                         bdevname(rdev->bdev,b), mdname(mddev));
3846                 err = -EINVAL;
3847                 goto abort_export;
3848         }
3849         clear_bit(In_sync, &rdev->flags);
3850         rdev->desc_nr = -1;
3851         err = bind_rdev_to_array(rdev, mddev);
3852         if (err)
3853                 goto abort_export;
3854
3855         /*
3856          * The rest should better be atomic, we can have disk failures
3857          * noticed in interrupt contexts ...
3858          */
3859
3860         if (rdev->desc_nr == mddev->max_disks) {
3861                 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
3862                         mdname(mddev));
3863                 err = -EBUSY;
3864                 goto abort_unbind_export;
3865         }
3866
3867         rdev->raid_disk = -1;
3868
3869         md_update_sb(mddev);
3870
3871         /*
3872          * Kick recovery, maybe this spare has to be added to the
3873          * array immediately.
3874          */
3875         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3876         md_wakeup_thread(mddev->thread);
3877         md_new_event(mddev);
3878         return 0;
3879
3880 abort_unbind_export:
3881         unbind_rdev_from_array(rdev);
3882
3883 abort_export:
3884         export_rdev(rdev);
3885         return err;
3886 }
3887
3888 static int set_bitmap_file(mddev_t *mddev, int fd)
3889 {
3890         int err;
3891
3892         if (mddev->pers) {
3893                 if (!mddev->pers->quiesce)
3894                         return -EBUSY;
3895                 if (mddev->recovery || mddev->sync_thread)
3896                         return -EBUSY;
3897                 /* we should be able to change the bitmap.. */
3898         }
3899
3900
3901         if (fd >= 0) {
3902                 if (mddev->bitmap)
3903                         return -EEXIST; /* cannot add when bitmap is present */
3904                 mddev->bitmap_file = fget(fd);
3905
3906                 if (mddev->bitmap_file == NULL) {
3907                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
3908                                mdname(mddev));
3909                         return -EBADF;
3910                 }
3911
3912                 err = deny_bitmap_write_access(mddev->bitmap_file);
3913                 if (err) {
3914                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
3915                                mdname(mddev));
3916                         fput(mddev->bitmap_file);
3917                         mddev->bitmap_file = NULL;
3918                         return err;
3919                 }
3920                 mddev->bitmap_offset = 0; /* file overrides offset */
3921         } else if (mddev->bitmap == NULL)
3922                 return -ENOENT; /* cannot remove what isn't there */
3923         err = 0;
3924         if (mddev->pers) {
3925                 mddev->pers->quiesce(mddev, 1);
3926                 if (fd >= 0)
3927                         err = bitmap_create(mddev);
3928                 if (fd < 0 || err) {
3929                         bitmap_destroy(mddev);
3930                         fd = -1; /* make sure to put the file */
3931                 }
3932                 mddev->pers->quiesce(mddev, 0);
3933         }
3934         if (fd < 0) {
3935                 if (mddev->bitmap_file) {
3936                         restore_bitmap_write_access(mddev->bitmap_file);
3937                         fput(mddev->bitmap_file);
3938                 }
3939                 mddev->bitmap_file = NULL;
3940         }
3941
3942         return err;
3943 }
3944
3945 /*
3946  * set_array_info is used two different ways
3947  * The original usage is when creating a new array.
3948  * In this usage, raid_disks is > 0 and it together with
3949  *  level, size, not_persistent,layout,chunksize determine the
3950  *  shape of the array.
3951  *  This will always create an array with a type-0.90.0 superblock.
3952  * The newer usage is when assembling an array.
3953  *  In this case raid_disks will be 0, and the major_version field is
3954  *  use to determine which style super-blocks are to be found on the devices.
3955  *  The minor and patch _version numbers are also kept incase the
3956  *  super_block handler wishes to interpret them.
3957  */
3958 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3959 {
3960
3961         if (info->raid_disks == 0) {
3962                 /* just setting version number for superblock loading */
3963                 if (info->major_version < 0 ||
3964                     info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
3965                     super_types[info->major_version].name == NULL) {
3966                         /* maybe try to auto-load a module? */
3967                         printk(KERN_INFO 
3968                                 "md: superblock version %d not known\n",
3969                                 info->major_version);
3970                         return -EINVAL;
3971                 }
3972                 mddev->major_version = info->major_version;
3973                 mddev->minor_version = info->minor_version;
3974                 mddev->patch_version = info->patch_version;
3975                 return 0;
3976         }
3977         mddev->major_version = MD_MAJOR_VERSION;
3978         mddev->minor_version = MD_MINOR_VERSION;
3979         mddev->patch_version = MD_PATCHLEVEL_VERSION;
3980         mddev->ctime         = get_seconds();
3981
3982         mddev->level         = info->level;
3983         mddev->clevel[0]     = 0;
3984         mddev->size          = info->size;
3985         mddev->raid_disks    = info->raid_disks;
3986         /* don't set md_minor, it is determined by which /dev/md* was
3987          * openned
3988          */
3989         if (info->state & (1<<MD_SB_CLEAN))
3990                 mddev->recovery_cp = MaxSector;
3991         else
3992                 mddev->recovery_cp = 0;
3993         mddev->persistent    = ! info->not_persistent;
3994
3995         mddev->layout        = info->layout;
3996         mddev->chunk_size    = info->chunk_size;
3997
3998         mddev->max_disks     = MD_SB_DISKS;
3999
4000         mddev->sb_dirty      = 1;
4001
4002         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4003         mddev->bitmap_offset = 0;
4004
4005         mddev->reshape_position = MaxSector;
4006
4007         /*
4008          * Generate a 128 bit UUID
4009          */
4010         get_random_bytes(mddev->uuid, 16);
4011
4012         mddev->new_level = mddev->level;
4013         mddev->new_chunk = mddev->chunk_size;
4014         mddev->new_layout = mddev->layout;
4015         mddev->delta_disks = 0;
4016
4017         return 0;
4018 }
4019
4020 static int update_size(mddev_t *mddev, unsigned long size)
4021 {
4022         mdk_rdev_t * rdev;
4023         int rv;
4024         struct list_head *tmp;
4025         int fit = (size == 0);
4026
4027         if (mddev->pers->resize == NULL)
4028                 return -EINVAL;
4029         /* The "size" is the amount of each device that is used.
4030          * This can only make sense for arrays with redundancy.
4031          * linear and raid0 always use whatever space is available
4032          * We can only consider changing the size if no resync
4033          * or reconstruction is happening, and if the new size
4034          * is acceptable. It must fit before the sb_offset or,
4035          * if that is <data_offset, it must fit before the
4036          * size of each device.
4037          * If size is zero, we find the largest size that fits.
4038          */
4039         if (mddev->sync_thread)
4040                 return -EBUSY;
4041         ITERATE_RDEV(mddev,rdev,tmp) {
4042                 sector_t avail;
4043                 if (rdev->sb_offset > rdev->data_offset)
4044                         avail = (rdev->sb_offset*2) - rdev->data_offset;
4045                 else
4046                         avail = get_capacity(rdev->bdev->bd_disk)
4047                                 - rdev->data_offset;
4048                 if (fit && (size == 0 || size > avail/2))
4049                         size = avail/2;
4050                 if (avail < ((sector_t)size << 1))
4051                         return -ENOSPC;
4052         }
4053         rv = mddev->pers->resize(mddev, (sector_t)size *2);
4054         if (!rv) {
4055                 struct block_device *bdev;
4056
4057                 bdev = bdget_disk(mddev->gendisk, 0);
4058                 if (bdev) {
4059                         mutex_lock(&bdev->bd_inode->i_mutex);
4060                         i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4061                         mutex_unlock(&bdev->bd_inode->i_mutex);
4062                         bdput(bdev);
4063                 }
4064         }
4065         return rv;
4066 }
4067
4068 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4069 {
4070         int rv;
4071         /* change the number of raid disks */
4072         if (mddev->pers->check_reshape == NULL)
4073                 return -EINVAL;
4074         if (raid_disks <= 0 ||
4075             raid_disks >= mddev->max_disks)
4076                 return -EINVAL;
4077         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4078                 return -EBUSY;
4079         mddev->delta_disks = raid_disks - mddev->raid_disks;
4080
4081         rv = mddev->pers->check_reshape(mddev);
4082         return rv;
4083 }
4084
4085
4086 /*
4087  * update_array_info is used to change the configuration of an
4088  * on-line array.
4089  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
4090  * fields in the info are checked against the array.
4091  * Any differences that cannot be handled will cause an error.
4092  * Normally, only one change can be managed at a time.
4093  */
4094 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4095 {
4096         int rv = 0;
4097         int cnt = 0;
4098         int state = 0;
4099
4100         /* calculate expected state,ignoring low bits */
4101         if (mddev->bitmap && mddev->bitmap_offset)
4102                 state |= (1 << MD_SB_BITMAP_PRESENT);
4103
4104         if (mddev->major_version != info->major_version ||
4105             mddev->minor_version != info->minor_version ||
4106 /*          mddev->patch_version != info->patch_version || */
4107             mddev->ctime         != info->ctime         ||
4108             mddev->level         != info->level         ||
4109 /*          mddev->layout        != info->layout        || */
4110             !mddev->persistent   != info->not_persistent||
4111             mddev->chunk_size    != info->chunk_size    ||
4112             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4113             ((state^info->state) & 0xfffffe00)
4114                 )
4115                 return -EINVAL;
4116         /* Check there is only one change */
4117         if (info->size >= 0 && mddev->size != info->size) cnt++;
4118         if (mddev->raid_disks != info->raid_disks) cnt++;
4119         if (mddev->layout != info->layout) cnt++;
4120         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4121         if (cnt == 0) return 0;
4122         if (cnt > 1) return -EINVAL;
4123
4124         if (mddev->layout != info->layout) {
4125                 /* Change layout
4126                  * we don't need to do anything at the md level, the
4127                  * personality will take care of it all.
4128                  */
4129                 if (mddev->pers->reconfig == NULL)
4130                         return -EINVAL;
4131                 else
4132                         return mddev->pers->reconfig(mddev, info->layout, -1);
4133         }
4134         if (info->size >= 0 && mddev->size != info->size)
4135                 rv = update_size(mddev, info->size);
4136
4137         if (mddev->raid_disks    != info->raid_disks)
4138                 rv = update_raid_disks(mddev, info->raid_disks);
4139
4140         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4141                 if (mddev->pers->quiesce == NULL)
4142                         return -EINVAL;
4143                 if (mddev->recovery || mddev->sync_thread)
4144                         return -EBUSY;
4145                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4146                         /* add the bitmap */
4147                         if (mddev->bitmap)
4148                                 return -EEXIST;
4149                         if (mddev->default_bitmap_offset == 0)
4150                                 return -EINVAL;
4151                         mddev->bitmap_offset = mddev->default_bitmap_offset;
4152                         mddev->pers->quiesce(mddev, 1);
4153                         rv = bitmap_create(mddev);
4154                         if (rv)
4155                                 bitmap_destroy(mddev);
4156                         mddev->pers->quiesce(mddev, 0);
4157                 } else {
4158                         /* remove the bitmap */
4159                         if (!mddev->bitmap)
4160                                 return -ENOENT;
4161                         if (mddev->bitmap->file)
4162                                 return -EINVAL;
4163                         mddev->pers->quiesce(mddev, 1);
4164                         bitmap_destroy(mddev);
4165                         mddev->pers->quiesce(mddev, 0);
4166                         mddev->bitmap_offset = 0;
4167                 }
4168         }
4169         md_update_sb(mddev);
4170         return rv;
4171 }
4172
4173 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4174 {
4175         mdk_rdev_t *rdev;
4176
4177         if (mddev->pers == NULL)
4178                 return -ENODEV;
4179
4180         rdev = find_rdev(mddev, dev);
4181         if (!rdev)
4182                 return -ENODEV;
4183
4184         md_error(mddev, rdev);
4185         return 0;
4186 }
4187
4188 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4189 {
4190         mddev_t *mddev = bdev->bd_disk->private_data;
4191
4192         geo->heads = 2;
4193         geo->sectors = 4;
4194         geo->cylinders = get_capacity(mddev->gendisk) / 8;
4195         return 0;
4196 }
4197
4198 static int md_ioctl(struct inode *inode, struct file *file,
4199                         unsigned int cmd, unsigned long arg)
4200 {
4201         int err = 0;
4202         void __user *argp = (void __user *)arg;
4203         mddev_t *mddev = NULL;
4204
4205         if (!capable(CAP_SYS_ADMIN))
4206                 return -EACCES;
4207
4208         /*
4209          * Commands dealing with the RAID driver but not any
4210          * particular array:
4211          */
4212         switch (cmd)
4213         {
4214                 case RAID_VERSION:
4215                         err = get_version(argp);
4216                         goto done;
4217
4218                 case PRINT_RAID_DEBUG:
4219                         err = 0;
4220                         md_print_devices();
4221                         goto done;
4222
4223 #ifndef MODULE
4224                 case RAID_AUTORUN:
4225                         err = 0;
4226                         autostart_arrays(arg);
4227                         goto done;
4228 #endif
4229                 default:;
4230         }
4231
4232         /*
4233          * Commands creating/starting a new array:
4234          */
4235
4236         mddev = inode->i_bdev->bd_disk->private_data;
4237
4238         if (!mddev) {
4239                 BUG();
4240                 goto abort;
4241         }
4242
4243
4244         if (cmd == START_ARRAY) {
4245                 /* START_ARRAY doesn't need to lock the array as autostart_array
4246                  * does the locking, and it could even be a different array
4247                  */
4248                 static int cnt = 3;
4249                 if (cnt > 0 ) {
4250                         printk(KERN_WARNING
4251                                "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
4252                                "This will not be supported beyond July 2006\n",
4253                                current->comm, current->pid);
4254                         cnt--;
4255                 }
4256                 err = autostart_array(new_decode_dev(arg));
4257                 if (err) {
4258                         printk(KERN_WARNING "md: autostart failed!\n");
4259                         goto abort;
4260                 }
4261                 goto done;
4262         }
4263
4264         err = mddev_lock(mddev);
4265         if (err) {
4266                 printk(KERN_INFO 
4267                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
4268                         err, cmd);
4269                 goto abort;
4270         }
4271
4272         switch (cmd)
4273         {
4274                 case SET_ARRAY_INFO:
4275                         {
4276                                 mdu_array_info_t info;
4277                                 if (!arg)
4278                                         memset(&info, 0, sizeof(info));
4279                                 else if (copy_from_user(&info, argp, sizeof(info))) {
4280                                         err = -EFAULT;
4281                                         goto abort_unlock;
4282                                 }
4283                                 if (mddev->pers) {
4284                                         err = update_array_info(mddev, &info);
4285                                         if (err) {
4286                                                 printk(KERN_WARNING "md: couldn't update"
4287                                                        " array info. %d\n", err);
4288                                                 goto abort_unlock;
4289                                         }
4290                                         goto done_unlock;
4291                                 }
4292                                 if (!list_empty(&mddev->disks)) {
4293                                         printk(KERN_WARNING
4294                                                "md: array %s already has disks!\n",
4295                                                mdname(mddev));
4296                                         err = -EBUSY;
4297                                         goto abort_unlock;
4298                                 }
4299                                 if (mddev->raid_disks) {
4300                                         printk(KERN_WARNING
4301                                                "md: array %s already initialised!\n",
4302                                                mdname(mddev));
4303                                         err = -EBUSY;
4304                                         goto abort_unlock;
4305                                 }
4306                                 err = set_array_info(mddev, &info);
4307                                 if (err) {
4308                                         printk(KERN_WARNING "md: couldn't set"
4309                                                " array info. %d\n", err);
4310                                         goto abort_unlock;
4311                                 }
4312                         }
4313                         goto done_unlock;
4314
4315                 default:;
4316         }
4317
4318         /*
4319          * Commands querying/configuring an existing array:
4320          */
4321         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4322          * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
4323         if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4324                         && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
4325                 err = -ENODEV;
4326                 goto abort_unlock;
4327         }
4328
4329         /*
4330          * Commands even a read-only array can execute:
4331          */
4332         switch (cmd)
4333         {
4334                 case GET_ARRAY_INFO:
4335                         err = get_array_info(mddev, argp);
4336                         goto done_unlock;
4337
4338                 case GET_BITMAP_FILE:
4339                         err = get_bitmap_file(mddev, argp);
4340                         goto done_unlock;
4341
4342                 case GET_DISK_INFO:
4343                         err = get_disk_info(mddev, argp);
4344                         goto done_unlock;
4345
4346                 case RESTART_ARRAY_RW:
4347                         err = restart_array(mddev);
4348                         goto done_unlock;
4349
4350                 case STOP_ARRAY:
4351                         err = do_md_stop (mddev, 0);
4352                         goto done_unlock;
4353
4354                 case STOP_ARRAY_RO:
4355                         err = do_md_stop (mddev, 1);
4356                         goto done_unlock;
4357
4358         /*
4359          * We have a problem here : there is no easy way to give a CHS
4360          * virtual geometry. We currently pretend that we have a 2 heads
4361          * 4 sectors (with a BIG number of cylinders...). This drives
4362          * dosfs just mad... ;-)
4363          */
4364         }
4365
4366         /*
4367          * The remaining ioctls are changing the state of the
4368          * superblock, so we do not allow them on read-only arrays.
4369          * However non-MD ioctls (e.g. get-size) will still come through
4370          * here and hit the 'default' below, so only disallow
4371          * 'md' ioctls, and switch to rw mode if started auto-readonly.
4372          */
4373         if (_IOC_TYPE(cmd) == MD_MAJOR &&
4374             mddev->ro && mddev->pers) {
4375                 if (mddev->ro == 2) {
4376                         mddev->ro = 0;
4377                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4378                 md_wakeup_thread(mddev->thread);
4379
4380                 } else {
4381                         err = -EROFS;
4382                         goto abort_unlock;
4383                 }
4384         }
4385
4386         switch (cmd)
4387         {
4388                 case ADD_NEW_DISK:
4389                 {
4390                         mdu_disk_info_t info;
4391                         if (copy_from_user(&info, argp, sizeof(info)))
4392                                 err = -EFAULT;
4393                         else
4394                                 err = add_new_disk(mddev, &info);
4395                         goto done_unlock;
4396                 }
4397
4398                 case HOT_REMOVE_DISK:
4399                         err = hot_remove_disk(mddev, new_decode_dev(arg));
4400                         goto done_unlock;
4401
4402                 case HOT_ADD_DISK:
4403                         err = hot_add_disk(mddev, new_decode_dev(arg));
4404                         goto done_unlock;
4405
4406                 case SET_DISK_FAULTY:
4407                         err = set_disk_faulty(mddev, new_decode_dev(arg));
4408                         goto done_unlock;
4409
4410                 case RUN_ARRAY:
4411                         err = do_md_run (mddev);
4412                         goto done_unlock;
4413
4414                 case SET_BITMAP_FILE:
4415                         err = set_bitmap_file(mddev, (int)arg);
4416                         goto done_unlock;
4417
4418                 default:
4419                         err = -EINVAL;
4420                         goto abort_unlock;
4421         }
4422
4423 done_unlock:
4424 abort_unlock:
4425         mddev_unlock(mddev);
4426
4427         return err;
4428 done:
4429         if (err)
4430                 MD_BUG();
4431 abort:
4432         return err;
4433 }
4434
4435 static int md_open(struct inode *inode, struct file *file)
4436 {
4437         /*
4438          * Succeed if we can lock the mddev, which confirms that
4439          * it isn't being stopped right now.
4440          */
4441         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4442         int err;
4443
4444         if ((err = mddev_lock(mddev)))
4445                 goto out;
4446
4447         err = 0;
4448         mddev_get(mddev);
4449         mddev_unlock(mddev);
4450
4451         check_disk_change(inode->i_bdev);
4452  out:
4453         return err;
4454 }
4455
4456 static int md_release(struct inode *inode, struct file * file)
4457 {
4458         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4459
4460         if (!mddev)
4461                 BUG();
4462         mddev_put(mddev);
4463
4464         return 0;
4465 }
4466
4467 static int md_media_changed(struct gendisk *disk)
4468 {
4469         mddev_t *mddev = disk->private_data;
4470
4471         return mddev->changed;
4472 }
4473
4474 static int md_revalidate(struct gendisk *disk)
4475 {
4476         mddev_t *mddev = disk->private_data;
4477
4478         mddev->changed = 0;
4479         return 0;
4480 }
4481 static struct block_device_operations md_fops =
4482 {
4483         .owner          = THIS_MODULE,
4484         .open           = md_open,
4485         .release        = md_release,
4486         .ioctl          = md_ioctl,
4487         .getgeo         = md_getgeo,
4488         .media_changed  = md_media_changed,
4489         .revalidate_disk= md_revalidate,
4490 };
4491
4492 static int md_thread(void * arg)
4493 {
4494         mdk_thread_t *thread = arg;
4495
4496         /*
4497          * md_thread is a 'system-thread', it's priority should be very
4498          * high. We avoid resource deadlocks individually in each
4499          * raid personality. (RAID5 does preallocation) We also use RR and
4500          * the very same RT priority as kswapd, thus we will never get
4501          * into a priority inversion deadlock.
4502          *
4503          * we definitely have to have equal or higher priority than
4504          * bdflush, otherwise bdflush will deadlock if there are too
4505          * many dirty RAID5 blocks.
4506          */
4507
4508         allow_signal(SIGKILL);
4509         while (!kthread_should_stop()) {
4510
4511                 /* We need to wait INTERRUPTIBLE so that
4512                  * we don't add to the load-average.
4513                  * That means we need to be sure no signals are
4514                  * pending
4515                  */
4516                 if (signal_pending(current))
4517                         flush_signals(current);
4518
4519                 wait_event_interruptible_timeout
4520                         (thread->wqueue,
4521                          test_bit(THREAD_WAKEUP, &thread->flags)
4522                          || kthread_should_stop(),
4523                          thread->timeout);
4524                 try_to_freeze();
4525
4526                 clear_bit(THREAD_WAKEUP, &thread->flags);
4527
4528                 thread->run(thread->mddev);
4529         }
4530
4531         return 0;
4532 }
4533
4534 void md_wakeup_thread(mdk_thread_t *thread)
4535 {
4536         if (thread) {
4537                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
4538                 set_bit(THREAD_WAKEUP, &thread->flags);
4539                 wake_up(&thread->wqueue);
4540         }
4541 }
4542
4543 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
4544                                  const char *name)
4545 {
4546         mdk_thread_t *thread;
4547
4548         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
4549         if (!thread)
4550                 return NULL;
4551
4552         init_waitqueue_head(&thread->wqueue);
4553
4554         thread->run = run;
4555         thread->mddev = mddev;
4556         thread->timeout = MAX_SCHEDULE_TIMEOUT;
4557         thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
4558         if (IS_ERR(thread->tsk)) {
4559                 kfree(thread);
4560                 return NULL;
4561         }
4562         return thread;
4563 }
4564
4565 void md_unregister_thread(mdk_thread_t *thread)
4566 {
4567         dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
4568
4569         kthread_stop(thread->tsk);
4570         kfree(thread);
4571 }
4572
4573 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4574 {
4575         if (!mddev) {
4576                 MD_BUG();
4577                 return;
4578         }
4579
4580         if (!rdev || test_bit(Faulty, &rdev->flags))
4581                 return;
4582 /*
4583         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4584                 mdname(mddev),
4585                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
4586                 __builtin_return_address(0),__builtin_return_address(1),
4587                 __builtin_return_address(2),__builtin_return_address(3));
4588 */
4589         if (!mddev->pers->error_handler)
4590                 return;
4591         mddev->pers->error_handler(mddev,rdev);
4592         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4593         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4594         md_wakeup_thread(mddev->thread);
4595         md_new_event_inintr(mddev);
4596 }
4597
4598 /* seq_file implementation /proc/mdstat */
4599
4600 static void status_unused(struct seq_file *seq)
4601 {
4602         int i = 0;
4603         mdk_rdev_t *rdev;
4604         struct list_head *tmp;
4605
4606         seq_printf(seq, "unused devices: ");
4607
4608         ITERATE_RDEV_PENDING(rdev,tmp) {
4609                 char b[BDEVNAME_SIZE];
4610                 i++;
4611                 seq_printf(seq, "%s ",
4612                               bdevname(rdev->bdev,b));
4613         }
4614         if (!i)
4615                 seq_printf(seq, "<none>");
4616
4617         seq_printf(seq, "\n");
4618 }
4619
4620
4621 static void status_resync(struct seq_file *seq, mddev_t * mddev)
4622 {
4623         sector_t max_blocks, resync, res;
4624         unsigned long dt, db, rt;
4625         int scale;
4626         unsigned int per_milli;
4627
4628         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4629
4630         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4631                 max_blocks = mddev->resync_max_sectors >> 1;
4632         else
4633                 max_blocks = mddev->size;
4634
4635         /*
4636          * Should not happen.
4637          */
4638         if (!max_blocks) {
4639                 MD_BUG();
4640                 return;
4641         }
4642         /* Pick 'scale' such that (resync>>scale)*1000 will fit
4643          * in a sector_t, and (max_blocks>>scale) will fit in a
4644          * u32, as those are the requirements for sector_div.
4645          * Thus 'scale' must be at least 10
4646          */
4647         scale = 10;
4648         if (sizeof(sector_t) > sizeof(unsigned long)) {
4649                 while ( max_blocks/2 > (1ULL<<(scale+32)))
4650                         scale++;
4651         }
4652         res = (resync>>scale)*1000;
4653         sector_div(res, (u32)((max_blocks>>scale)+1));
4654
4655         per_milli = res;
4656         {
4657                 int i, x = per_milli/50, y = 20-x;
4658                 seq_printf(seq, "[");
4659                 for (i = 0; i < x; i++)
4660                         seq_printf(seq, "=");
4661                 seq_printf(seq, ">");
4662                 for (i = 0; i < y; i++)
4663                         seq_printf(seq, ".");
4664                 seq_printf(seq, "] ");
4665         }
4666         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4667                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4668                     "reshape" :
4669                       (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4670                        "resync" : "recovery")),
4671                       per_milli/10, per_milli % 10,
4672                    (unsigned long long) resync,
4673                    (unsigned long long) max_blocks);
4674
4675         /*
4676          * We do not want to overflow, so the order of operands and
4677          * the * 100 / 100 trick are important. We do a +1 to be
4678          * safe against division by zero. We only estimate anyway.
4679          *
4680          * dt: time from mark until now
4681          * db: blocks written from mark until now
4682          * rt: remaining time
4683          */
4684         dt = ((jiffies - mddev->resync_mark) / HZ);
4685         if (!dt) dt++;
4686         db = resync - (mddev->resync_mark_cnt/2);
4687         rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
4688
4689         seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4690
4691         seq_printf(seq, " speed=%ldK/sec", db/dt);
4692 }
4693
4694 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
4695 {
4696         struct list_head *tmp;
4697         loff_t l = *pos;
4698         mddev_t *mddev;
4699
4700         if (l >= 0x10000)
4701                 return NULL;
4702         if (!l--)
4703                 /* header */
4704                 return (void*)1;
4705
4706         spin_lock(&all_mddevs_lock);
4707         list_for_each(tmp,&all_mddevs)
4708                 if (!l--) {
4709                         mddev = list_entry(tmp, mddev_t, all_mddevs);
4710                         mddev_get(mddev);
4711                         spin_unlock(&all_mddevs_lock);
4712                         return mddev;
4713                 }
4714         spin_unlock(&all_mddevs_lock);
4715         if (!l--)
4716                 return (void*)2;/* tail */
4717         return NULL;
4718 }
4719
4720 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4721 {
4722         struct list_head *tmp;
4723         mddev_t *next_mddev, *mddev = v;
4724         
4725         ++*pos;
4726         if (v == (void*)2)
4727                 return NULL;
4728
4729         spin_lock(&all_mddevs_lock);
4730         if (v == (void*)1)
4731                 tmp = all_mddevs.next;
4732         else
4733                 tmp = mddev->all_mddevs.next;
4734         if (tmp != &all_mddevs)
4735                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
4736         else {
4737                 next_mddev = (void*)2;
4738                 *pos = 0x10000;
4739         }               
4740         spin_unlock(&all_mddevs_lock);
4741
4742         if (v != (void*)1)
4743                 mddev_put(mddev);
4744         return next_mddev;
4745
4746 }
4747
4748 static void md_seq_stop(struct seq_file *seq, void *v)
4749 {
4750         mddev_t *mddev = v;
4751
4752         if (mddev && v != (void*)1 && v != (void*)2)
4753                 mddev_put(mddev);
4754 }
4755
4756 struct mdstat_info {
4757         int event;
4758 };
4759
4760 static int md_seq_show(struct seq_file *seq, void *v)
4761 {
4762         mddev_t *mddev = v;
4763         sector_t size;
4764         struct list_head *tmp2;
4765         mdk_rdev_t *rdev;
4766         struct mdstat_info *mi = seq->private;
4767         struct bitmap *bitmap;
4768
4769         if (v == (void*)1) {
4770                 struct mdk_personality *pers;
4771                 seq_printf(seq, "Personalities : ");
4772                 spin_lock(&pers_lock);
4773                 list_for_each_entry(pers, &pers_list, list)
4774                         seq_printf(seq, "[%s] ", pers->name);
4775
4776                 spin_unlock(&pers_lock);
4777                 seq_printf(seq, "\n");
4778                 mi->event = atomic_read(&md_event_count);
4779                 return 0;
4780         }
4781         if (v == (void*)2) {
4782                 status_unused(seq);
4783                 return 0;
4784         }
4785
4786         if (mddev_lock(mddev) < 0)
4787                 return -EINTR;
4788
4789         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
4790                 seq_printf(seq, "%s : %sactive", mdname(mddev),
4791                                                 mddev->pers ? "" : "in");
4792                 if (mddev->pers) {
4793                         if (mddev->ro==1)
4794                                 seq_printf(seq, " (read-only)");
4795                         if (mddev->ro==2)
4796                                 seq_printf(seq, "(auto-read-only)");
4797                         seq_printf(seq, " %s", mddev->pers->name);
4798                 }
4799
4800                 size = 0;
4801                 ITERATE_RDEV(mddev,rdev,tmp2) {
4802                         char b[BDEVNAME_SIZE];
4803                         seq_printf(seq, " %s[%d]",
4804                                 bdevname(rdev->bdev,b), rdev->desc_nr);
4805                         if (test_bit(WriteMostly, &rdev->flags))
4806                                 seq_printf(seq, "(W)");
4807                         if (test_bit(Faulty, &rdev->flags)) {
4808                                 seq_printf(seq, "(F)");
4809                                 continue;
4810                         } else if (rdev->raid_disk < 0)
4811                                 seq_printf(seq, "(S)"); /* spare */
4812                         size += rdev->size;
4813                 }
4814
4815                 if (!list_empty(&mddev->disks)) {
4816                         if (mddev->pers)
4817                                 seq_printf(seq, "\n      %llu blocks",
4818                                         (unsigned long long)mddev->array_size);
4819                         else
4820                                 seq_printf(seq, "\n      %llu blocks",
4821                                         (unsigned long long)size);
4822                 }
4823                 if (mddev->persistent) {
4824                         if (mddev->major_version != 0 ||
4825                             mddev->minor_version != 90) {
4826                                 seq_printf(seq," super %d.%d",
4827                                            mddev->major_version,
4828                                            mddev->minor_version);
4829                         }
4830                 } else
4831                         seq_printf(seq, " super non-persistent");
4832
4833                 if (mddev->pers) {
4834                         mddev->pers->status (seq, mddev);
4835                         seq_printf(seq, "\n      ");
4836                         if (mddev->pers->sync_request) {
4837                                 if (mddev->curr_resync > 2) {
4838                                         status_resync (seq, mddev);
4839                                         seq_printf(seq, "\n      ");
4840                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
4841                                         seq_printf(seq, "\tresync=DELAYED\n      ");
4842                                 else if (mddev->recovery_cp < MaxSector)
4843                                         seq_printf(seq, "\tresync=PENDING\n      ");
4844                         }
4845                 } else
4846                         seq_printf(seq, "\n       ");
4847
4848                 if ((bitmap = mddev->bitmap)) {
4849                         unsigned long chunk_kb;
4850                         unsigned long flags;
4851                         spin_lock_irqsave(&bitmap->lock, flags);
4852                         chunk_kb = bitmap->chunksize >> 10;
4853                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
4854                                 "%lu%s chunk",
4855                                 bitmap->pages - bitmap->missing_pages,
4856                                 bitmap->pages,
4857                                 (bitmap->pages - bitmap->missing_pages)
4858                                         << (PAGE_SHIFT - 10),
4859                                 chunk_kb ? chunk_kb : bitmap->chunksize,
4860                                 chunk_kb ? "KB" : "B");
4861                         if (bitmap->file) {
4862                                 seq_printf(seq, ", file: ");
4863                                 seq_path(seq, bitmap->file->f_vfsmnt,
4864                                          bitmap->file->f_dentry," \t\n");
4865                         }
4866
4867                         seq_printf(seq, "\n");
4868                         spin_unlock_irqrestore(&bitmap->lock, flags);
4869                 }
4870
4871                 seq_printf(seq, "\n");
4872         }
4873         mddev_unlock(mddev);
4874         
4875         return 0;
4876 }
4877
4878 static struct seq_operations md_seq_ops = {
4879         .start  = md_seq_start,
4880         .next   = md_seq_next,
4881         .stop   = md_seq_stop,
4882         .show   = md_seq_show,
4883 };
4884
4885 static int md_seq_open(struct inode *inode, struct file *file)
4886 {
4887         int error;
4888         struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4889         if (mi == NULL)
4890                 return -ENOMEM;
4891
4892         error = seq_open(file, &md_seq_ops);
4893         if (error)
4894                 kfree(mi);
4895         else {
4896                 struct seq_file *p = file->private_data;
4897                 p->private = mi;
4898                 mi->event = atomic_read(&md_event_count);
4899         }
4900         return error;
4901 }
4902
4903 static int md_seq_release(struct inode *inode, struct file *file)
4904 {
4905         struct seq_file *m = file->private_data;
4906         struct mdstat_info *mi = m->private;
4907         m->private = NULL;
4908         kfree(mi);
4909         return seq_release(inode, file);
4910 }
4911
4912 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4913 {
4914         struct seq_file *m = filp->private_data;
4915         struct mdstat_info *mi = m->private;
4916         int mask;
4917
4918         poll_wait(filp, &md_event_waiters, wait);
4919
4920         /* always allow read */
4921         mask = POLLIN | POLLRDNORM;
4922
4923         if (mi->event != atomic_read(&md_event_count))
4924                 mask |= POLLERR | POLLPRI;
4925         return mask;
4926 }
4927
4928 static struct file_operations md_seq_fops = {
4929         .open           = md_seq_open,
4930         .read           = seq_read,
4931         .llseek         = seq_lseek,
4932         .release        = md_seq_release,
4933         .poll           = mdstat_poll,
4934 };
4935
4936 int register_md_personality(struct mdk_personality *p)
4937 {
4938         spin_lock(&pers_lock);
4939         list_add_tail(&p->list, &pers_list);
4940         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
4941         spin_unlock(&pers_lock);
4942         return 0;
4943 }
4944
4945 int unregister_md_personality(struct mdk_personality *p)
4946 {
4947         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
4948         spin_lock(&pers_lock);
4949         list_del_init(&p->list);
4950         spin_unlock(&pers_lock);
4951         return 0;
4952 }
4953
4954 static int is_mddev_idle(mddev_t *mddev)
4955 {
4956         mdk_rdev_t * rdev;
4957         struct list_head *tmp;
4958         int idle;
4959         unsigned long curr_events;
4960
4961         idle = 1;
4962         ITERATE_RDEV(mddev,rdev,tmp) {
4963                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
4964                 curr_events = disk_stat_read(disk, sectors[0]) + 
4965                                 disk_stat_read(disk, sectors[1]) - 
4966                                 atomic_read(&disk->sync_io);
4967                 /* The difference between curr_events and last_events
4968                  * will be affected by any new non-sync IO (making
4969                  * curr_events bigger) and any difference in the amount of
4970                  * in-flight syncio (making current_events bigger or smaller)
4971                  * The amount in-flight is currently limited to
4972                  * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
4973                  * which is at most 4096 sectors.
4974                  * These numbers are fairly fragile and should be made
4975                  * more robust, probably by enforcing the
4976                  * 'window size' that md_do_sync sort-of uses.
4977                  *
4978                  * Note: the following is an unsigned comparison.
4979                  */
4980                 if ((curr_events - rdev->last_events + 4096) > 8192) {
4981                         rdev->last_events = curr_events;
4982                         idle = 0;
4983                 }
4984         }
4985         return idle;
4986 }
4987
4988 void md_done_sync(mddev_t *mddev, int blocks, int ok)
4989 {
4990         /* another "blocks" (512byte) blocks have been synced */
4991         atomic_sub(blocks, &mddev->recovery_active);
4992         wake_up(&mddev->recovery_wait);
4993         if (!ok) {
4994                 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4995                 md_wakeup_thread(mddev->thread);
4996                 // stop recovery, signal do_sync ....
4997         }
4998 }
4999
5000
5001 /* md_write_start(mddev, bi)
5002  * If we need to update some array metadata (e.g. 'active' flag
5003  * in superblock) before writing, schedule a superblock update
5004  * and wait for it to complete.
5005  */
5006 void md_write_start(mddev_t *mddev, struct bio *bi)
5007 {
5008         if (bio_data_dir(bi) != WRITE)
5009                 return;
5010
5011         BUG_ON(mddev->ro == 1);
5012         if (mddev->ro == 2) {
5013                 /* need to switch to read/write */
5014                 mddev->ro = 0;
5015                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5016                 md_wakeup_thread(mddev->thread);
5017         }
5018         atomic_inc(&mddev->writes_pending);
5019         if (mddev->in_sync) {
5020                 spin_lock_irq(&mddev->write_lock);
5021                 if (mddev->in_sync) {
5022                         mddev->in_sync = 0;
5023                         mddev->sb_dirty = 3;
5024                         md_wakeup_thread(mddev->thread);
5025                 }
5026                 spin_unlock_irq(&mddev->write_lock);
5027         }
5028         wait_event(mddev->sb_wait, mddev->sb_dirty==0);
5029 }
5030
5031 void md_write_end(mddev_t *mddev)
5032 {
5033         if (atomic_dec_and_test(&mddev->writes_pending)) {
5034                 if (mddev->safemode == 2)
5035                         md_wakeup_thread(mddev->thread);
5036                 else if (mddev->safemode_delay)
5037                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5038         }
5039 }
5040
5041 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5042
5043 #define SYNC_MARKS      10
5044 #define SYNC_MARK_STEP  (3*HZ)
5045 void md_do_sync(mddev_t *mddev)
5046 {
5047         mddev_t *mddev2;
5048         unsigned int currspeed = 0,
5049                  window;
5050         sector_t max_sectors,j, io_sectors;
5051         unsigned long mark[SYNC_MARKS];
5052         sector_t mark_cnt[SYNC_MARKS];
5053         int last_mark,m;
5054         struct list_head *tmp;
5055         sector_t last_check;
5056         int skipped = 0;
5057         struct list_head *rtmp;
5058         mdk_rdev_t *rdev;
5059
5060         /* just incase thread restarts... */
5061         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5062                 return;
5063         if (mddev->ro) /* never try to sync a read-only array */
5064                 return;
5065
5066         /* we overload curr_resync somewhat here.
5067          * 0 == not engaged in resync at all
5068          * 2 == checking that there is no conflict with another sync
5069          * 1 == like 2, but have yielded to allow conflicting resync to
5070          *              commense
5071          * other == active in resync - this many blocks
5072          *
5073          * Before starting a resync we must have set curr_resync to
5074          * 2, and then checked that every "conflicting" array has curr_resync
5075          * less than ours.  When we find one that is the same or higher
5076          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5077          * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5078          * This will mean we have to start checking from the beginning again.
5079          *
5080          */
5081
5082         do {
5083                 mddev->curr_resync = 2;
5084
5085         try_again:
5086                 if (kthread_should_stop()) {
5087                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5088                         goto skip;
5089                 }
5090                 ITERATE_MDDEV(mddev2,tmp) {
5091                         if (mddev2 == mddev)
5092                                 continue;
5093                         if (mddev2->curr_resync && 
5094                             match_mddev_units(mddev,mddev2)) {
5095                                 DEFINE_WAIT(wq);
5096                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
5097                                         /* arbitrarily yield */
5098                                         mddev->curr_resync = 1;
5099                                         wake_up(&resync_wait);
5100                                 }
5101                                 if (mddev > mddev2 && mddev->curr_resync == 1)
5102                                         /* no need to wait here, we can wait the next
5103                                          * time 'round when curr_resync == 2
5104                                          */
5105                                         continue;
5106                                 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5107                                 if (!kthread_should_stop() &&
5108                                     mddev2->curr_resync >= mddev->curr_resync) {
5109                                         printk(KERN_INFO "md: delaying resync of %s"
5110                                                " until %s has finished resync (they"
5111                                                " share one or more physical units)\n",
5112                                                mdname(mddev), mdname(mddev2));
5113                                         mddev_put(mddev2);
5114                                         schedule();
5115                                         finish_wait(&resync_wait, &wq);
5116                                         goto try_again;
5117                                 }
5118                                 finish_wait(&resync_wait, &wq);
5119                         }
5120                 }
5121         } while (mddev->curr_resync < 2);
5122
5123         j = 0;
5124         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5125                 /* resync follows the size requested by the personality,
5126                  * which defaults to physical size, but can be virtual size
5127                  */
5128                 max_sectors = mddev->resync_max_sectors;
5129                 mddev->resync_mismatches = 0;
5130                 /* we don't use the checkpoint if there's a bitmap */
5131                 if (!mddev->bitmap &&
5132                     !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5133                         j = mddev->recovery_cp;
5134         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5135                 max_sectors = mddev->size << 1;
5136         else {
5137                 /* recovery follows the physical size of devices */
5138                 max_sectors = mddev->size << 1;
5139                 j = MaxSector;
5140                 ITERATE_RDEV(mddev,rdev,rtmp)
5141                         if (rdev->raid_disk >= 0 &&
5142                             !test_bit(Faulty, &rdev->flags) &&
5143                             !test_bit(In_sync, &rdev->flags) &&
5144                             rdev->recovery_offset < j)
5145                                 j = rdev->recovery_offset;
5146         }
5147
5148         printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
5149         printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
5150                 " %d KB/sec/disc.\n", speed_min(mddev));
5151         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5152                "(but not more than %d KB/sec) for reconstruction.\n",
5153                speed_max(mddev));
5154
5155         is_mddev_idle(mddev); /* this also initializes IO event counters */
5156
5157         io_sectors = 0;
5158         for (m = 0; m < SYNC_MARKS; m++) {
5159                 mark[m] = jiffies;
5160                 mark_cnt[m] = io_sectors;
5161         }
5162         last_mark = 0;
5163         mddev->resync_mark = mark[last_mark];
5164         mddev->resync_mark_cnt = mark_cnt[last_mark];
5165
5166         /*
5167          * Tune reconstruction:
5168          */
5169         window = 32*(PAGE_SIZE/512);
5170         printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5171                 window/2,(unsigned long long) max_sectors/2);
5172
5173         atomic_set(&mddev->recovery_active, 0);
5174         init_waitqueue_head(&mddev->recovery_wait);
5175         last_check = 0;
5176
5177         if (j>2) {
5178                 printk(KERN_INFO 
5179                         "md: resuming recovery of %s from checkpoint.\n",
5180                         mdname(mddev));
5181                 mddev->curr_resync = j;
5182         }
5183
5184         while (j < max_sectors) {
5185                 sector_t sectors;
5186
5187                 skipped = 0;
5188                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5189                                             currspeed < speed_min(mddev));
5190                 if (sectors == 0) {
5191                         set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5192                         goto out;
5193                 }
5194
5195                 if (!skipped) { /* actual IO requested */
5196                         io_sectors += sectors;
5197                         atomic_add(sectors, &mddev->recovery_active);
5198                 }
5199
5200                 j += sectors;
5201                 if (j>1) mddev->curr_resync = j;
5202                 if (last_check == 0)
5203                         /* this is the earliers that rebuilt will be
5204                          * visible in /proc/mdstat
5205                          */
5206                         md_new_event(mddev);
5207
5208                 if (last_check + window > io_sectors || j == max_sectors)
5209                         continue;
5210
5211                 last_check = io_sectors;
5212
5213                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
5214                     test_bit(MD_RECOVERY_ERR, &mddev->recovery))
5215                         break;
5216
5217         repeat:
5218                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
5219                         /* step marks */
5220                         int next = (last_mark+1) % SYNC_MARKS;
5221
5222                         mddev->resync_mark = mark[next];
5223                         mddev->resync_mark_cnt = mark_cnt[next];
5224                         mark[next] = jiffies;
5225                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5226                         last_mark = next;
5227                 }
5228
5229
5230                 if (kthread_should_stop()) {
5231                         /*
5232                          * got a signal, exit.
5233                          */
5234                         printk(KERN_INFO 
5235                                 "md: md_do_sync() got signal ... exiting\n");
5236                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5237                         goto out;
5238                 }
5239
5240                 /*
5241                  * this loop exits only if either when we are slower than
5242                  * the 'hard' speed limit, or the system was IO-idle for
5243                  * a jiffy.
5244                  * the system might be non-idle CPU-wise, but we only care
5245                  * about not overloading the IO subsystem. (things like an
5246                  * e2fsck being done on the RAID array should execute fast)
5247                  */
5248                 mddev->queue->unplug_fn(mddev->queue);
5249                 cond_resched();
5250
5251                 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
5252                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
5253
5254                 if (currspeed > speed_min(mddev)) {
5255                         if ((currspeed > speed_max(mddev)) ||
5256                                         !is_mddev_idle(mddev)) {
5257                                 msleep(500);
5258                                 goto repeat;
5259                         }
5260                 }
5261         }
5262         printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
5263         /*
5264          * this also signals 'finished resyncing' to md_stop
5265          */
5266  out:
5267         mddev->queue->unplug_fn(mddev->queue);
5268
5269         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5270
5271         /* tell personality that we are finished */
5272         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5273
5274         if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5275             test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
5276             !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5277             mddev->curr_resync > 2) {
5278                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5279                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5280                                 if (mddev->curr_resync >= mddev->recovery_cp) {
5281                                         printk(KERN_INFO
5282                                                "md: checkpointing recovery of %s.\n",
5283                                                mdname(mddev));
5284                                         mddev->recovery_cp = mddev->curr_resync;
5285                                 }
5286                         } else
5287                                 mddev->recovery_cp = MaxSector;
5288                 } else {
5289                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5290                                 mddev->curr_resync = MaxSector;
5291                         ITERATE_RDEV(mddev,rdev,rtmp)
5292                                 if (rdev->raid_disk >= 0 &&
5293                                     !test_bit(Faulty, &rdev->flags) &&
5294                                     !test_bit(In_sync, &rdev->flags) &&
5295                                     rdev->recovery_offset < mddev->curr_resync)
5296                                         rdev->recovery_offset = mddev->curr_resync;
5297                         mddev->sb_dirty = 1;
5298                 }
5299         }
5300
5301  skip:
5302         mddev->curr_resync = 0;
5303         wake_up(&resync_wait);
5304         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5305         md_wakeup_thread(mddev->thread);
5306 }
5307 EXPORT_SYMBOL_GPL(md_do_sync);
5308
5309
5310 /*
5311  * This routine is regularly called by all per-raid-array threads to
5312  * deal with generic issues like resync and super-block update.
5313  * Raid personalities that don't have a thread (linear/raid0) do not
5314  * need this as they never do any recovery or update the superblock.
5315  *
5316  * It does not do any resync itself, but rather "forks" off other threads
5317  * to do that as needed.
5318  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5319  * "->recovery" and create a thread at ->sync_thread.
5320  * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5321  * and wakeups up this thread which will reap the thread and finish up.
5322  * This thread also removes any faulty devices (with nr_pending == 0).
5323  *
5324  * The overall approach is:
5325  *  1/ if the superblock needs updating, update it.
5326  *  2/ If a recovery thread is running, don't do anything else.
5327  *  3/ If recovery has finished, clean up, possibly marking spares active.
5328  *  4/ If there are any faulty devices, remove them.
5329  *  5/ If array is degraded, try to add spares devices
5330  *  6/ If array has spares or is not in-sync, start a resync thread.
5331  */
5332 void md_check_recovery(mddev_t *mddev)
5333 {
5334         mdk_rdev_t *rdev;
5335         struct list_head *rtmp;
5336
5337
5338         if (mddev->bitmap)
5339                 bitmap_daemon_work(mddev->bitmap);
5340
5341         if (mddev->ro)
5342                 return;
5343
5344         if (signal_pending(current)) {
5345                 if (mddev->pers->sync_request) {
5346                         printk(KERN_INFO "md: %s in immediate safe mode\n",
5347                                mdname(mddev));
5348                         mddev->safemode = 2;
5349                 }
5350                 flush_signals(current);
5351         }
5352
5353         if ( ! (
5354                 mddev->sb_dirty ||
5355                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5356                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5357                 (mddev->safemode == 1) ||
5358                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5359                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5360                 ))
5361                 return;
5362
5363         if (mddev_trylock(mddev)) {
5364                 int spares =0;
5365
5366                 spin_lock_irq(&mddev->write_lock);
5367                 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5368                     !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5369                         mddev->in_sync = 1;
5370                         mddev->sb_dirty = 3;
5371                 }
5372                 if (mddev->safemode == 1)
5373                         mddev->safemode = 0;
5374                 spin_unlock_irq(&mddev->write_lock);
5375
5376                 if (mddev->sb_dirty)
5377                         md_update_sb(mddev);
5378
5379
5380                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5381                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5382                         /* resync/recovery still happening */
5383                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5384                         goto unlock;
5385                 }
5386                 if (mddev->sync_thread) {
5387                         /* resync has finished, collect result */
5388                         md_unregister_thread(mddev->sync_thread);
5389                         mddev->sync_thread = NULL;
5390                         if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5391                             !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5392                                 /* success...*/
5393                                 /* activate any spares */
5394                                 mddev->pers->spare_active(mddev);
5395                         }
5396                         md_update_sb(mddev);
5397
5398                         /* if array is no-longer degraded, then any saved_raid_disk
5399                          * information must be scrapped
5400                          */
5401                         if (!mddev->degraded)
5402                                 ITERATE_RDEV(mddev,rdev,rtmp)
5403                                         rdev->saved_raid_disk = -1;
5404
5405                         mddev->recovery = 0;
5406                         /* flag recovery needed just to double check */
5407                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5408                         md_new_event(mddev);
5409                         goto unlock;
5410                 }
5411                 /* Clear some bits that don't mean anything, but
5412                  * might be left set
5413                  */
5414                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5415                 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5416                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5417                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5418
5419                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5420                         goto unlock;
5421                 /* no recovery is running.
5422                  * remove any failed drives, then
5423                  * add spares if possible.
5424                  * Spare are also removed and re-added, to allow
5425                  * the personality to fail the re-add.
5426                  */
5427                 ITERATE_RDEV(mddev,rdev,rtmp)
5428                         if (rdev->raid_disk >= 0 &&
5429                             (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
5430                             atomic_read(&rdev->nr_pending)==0) {
5431                                 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
5432                                         char nm[20];
5433                                         sprintf(nm,"rd%d", rdev->raid_disk);
5434                                         sysfs_remove_link(&mddev->kobj, nm);
5435                                         rdev->raid_disk = -1;
5436                                 }
5437                         }
5438
5439                 if (mddev->degraded) {
5440                         ITERATE_RDEV(mddev,rdev,rtmp)
5441                                 if (rdev->raid_disk < 0
5442                                     && !test_bit(Faulty, &rdev->flags)) {
5443                                         rdev->recovery_offset = 0;
5444                                         if (mddev->pers->hot_add_disk(mddev,rdev)) {
5445                                                 char nm[20];
5446                                                 sprintf(nm, "rd%d", rdev->raid_disk);
5447                                                 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
5448                                                 spares++;
5449                                                 md_new_event(mddev);
5450                                         } else
5451                                                 break;
5452                                 }
5453                 }
5454
5455                 if (spares) {
5456                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5457                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5458                 } else if (mddev->recovery_cp < MaxSector) {
5459                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5460                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5461                         /* nothing to be done ... */
5462                         goto unlock;
5463
5464                 if (mddev->pers->sync_request) {
5465                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5466                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
5467                                 /* We are adding a device or devices to an array
5468                                  * which has the bitmap stored on all devices.
5469                                  * So make sure all bitmap pages get written
5470                                  */
5471                                 bitmap_write_all(mddev->bitmap);
5472                         }
5473                         mddev->sync_thread = md_register_thread(md_do_sync,
5474                                                                 mddev,
5475                                                                 "%s_resync");
5476                         if (!mddev->sync_thread) {
5477                                 printk(KERN_ERR "%s: could not start resync"
5478                                         " thread...\n", 
5479                                         mdname(mddev));
5480                                 /* leave the spares where they are, it shouldn't hurt */
5481                                 mddev->recovery = 0;
5482                         } else
5483                                 md_wakeup_thread(mddev->sync_thread);
5484                         md_new_event(mddev);
5485                 }
5486         unlock:
5487                 mddev_unlock(mddev);
5488         }
5489 }
5490
5491 static int md_notify_reboot(struct notifier_block *this,
5492                             unsigned long code, void *x)
5493 {
5494         struct list_head *tmp;
5495         mddev_t *mddev;
5496
5497         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
5498
5499                 printk(KERN_INFO "md: stopping all md devices.\n");
5500
5501                 ITERATE_MDDEV(mddev,tmp)
5502                         if (mddev_trylock(mddev)) {
5503                                 do_md_stop (mddev, 1);
5504                                 mddev_unlock(mddev);
5505                         }
5506                 /*
5507                  * certain more exotic SCSI devices are known to be
5508                  * volatile wrt too early system reboots. While the
5509                  * right place to handle this issue is the given
5510                  * driver, we do want to have a safe RAID driver ...
5511                  */
5512                 mdelay(1000*1);
5513         }
5514         return NOTIFY_DONE;
5515 }
5516
5517 static struct notifier_block md_notifier = {
5518         .notifier_call  = md_notify_reboot,
5519         .next           = NULL,
5520         .priority       = INT_MAX, /* before any real devices */
5521 };
5522
5523 static void md_geninit(void)
5524 {
5525         struct proc_dir_entry *p;
5526
5527         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5528
5529         p = create_proc_entry("mdstat", S_IRUGO, NULL);
5530         if (p)
5531                 p->proc_fops = &md_seq_fops;
5532 }
5533
5534 static int __init md_init(void)
5535 {
5536         printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
5537                         " MD_SB_DISKS=%d\n",
5538                         MD_MAJOR_VERSION, MD_MINOR_VERSION,
5539                         MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
5540         printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
5541                         BITMAP_MINOR);
5542
5543         if (register_blkdev(MAJOR_NR, "md"))
5544                 return -1;
5545         if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
5546                 unregister_blkdev(MAJOR_NR, "md");
5547                 return -1;
5548         }
5549         blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
5550                                 md_probe, NULL, NULL);
5551         blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
5552                             md_probe, NULL, NULL);
5553
5554         register_reboot_notifier(&md_notifier);
5555         raid_table_header = register_sysctl_table(raid_root_table, 1);
5556
5557         md_geninit();
5558         return (0);
5559 }
5560
5561
5562 #ifndef MODULE
5563
5564 /*
5565  * Searches all registered partitions for autorun RAID arrays
5566  * at boot time.
5567  */
5568 static dev_t detected_devices[128];
5569 static int dev_cnt;
5570
5571 void md_autodetect_dev(dev_t dev)
5572 {
5573         if (dev_cnt >= 0 && dev_cnt < 127)
5574                 detected_devices[dev_cnt++] = dev;
5575 }
5576
5577
5578 static void autostart_arrays(int part)
5579 {
5580         mdk_rdev_t *rdev;
5581         int i;
5582
5583         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
5584
5585         for (i = 0; i < dev_cnt; i++) {
5586                 dev_t dev = detected_devices[i];
5587
5588                 rdev = md_import_device(dev,0, 0);
5589                 if (IS_ERR(rdev))
5590                         continue;
5591
5592                 if (test_bit(Faulty, &rdev->flags)) {
5593                         MD_BUG();
5594                         continue;
5595                 }
5596                 list_add(&rdev->same_set, &pending_raid_disks);
5597         }
5598         dev_cnt = 0;
5599
5600         autorun_devices(part);
5601 }
5602
5603 #endif
5604
5605 static __exit void md_exit(void)
5606 {
5607         mddev_t *mddev;
5608         struct list_head *tmp;
5609
5610         blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
5611         blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
5612
5613         unregister_blkdev(MAJOR_NR,"md");
5614         unregister_blkdev(mdp_major, "mdp");
5615         unregister_reboot_notifier(&md_notifier);
5616         unregister_sysctl_table(raid_table_header);
5617         remove_proc_entry("mdstat", NULL);
5618         ITERATE_MDDEV(mddev,tmp) {
5619                 struct gendisk *disk = mddev->gendisk;
5620                 if (!disk)
5621                         continue;
5622                 export_array(mddev);
5623                 del_gendisk(disk);
5624                 put_disk(disk);
5625                 mddev->gendisk = NULL;
5626                 mddev_put(mddev);
5627         }
5628 }
5629
5630 module_init(md_init)
5631 module_exit(md_exit)
5632
5633 static int get_ro(char *buffer, struct kernel_param *kp)
5634 {
5635         return sprintf(buffer, "%d", start_readonly);
5636 }
5637 static int set_ro(const char *val, struct kernel_param *kp)
5638 {
5639         char *e;
5640         int num = simple_strtoul(val, &e, 10);
5641         if (*val && (*e == '\0' || *e == '\n')) {
5642                 start_readonly = num;
5643                 return 0;
5644         }
5645         return -EINVAL;
5646 }
5647
5648 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
5649 module_param(start_dirty_degraded, int, 0644);
5650
5651
5652 EXPORT_SYMBOL(register_md_personality);
5653 EXPORT_SYMBOL(unregister_md_personality);
5654 EXPORT_SYMBOL(md_error);
5655 EXPORT_SYMBOL(md_done_sync);
5656 EXPORT_SYMBOL(md_write_start);
5657 EXPORT_SYMBOL(md_write_end);
5658 EXPORT_SYMBOL(md_register_thread);
5659 EXPORT_SYMBOL(md_unregister_thread);
5660 EXPORT_SYMBOL(md_wakeup_thread);
5661 EXPORT_SYMBOL(md_check_recovery);
5662 MODULE_LICENSE("GPL");
5663 MODULE_ALIAS("md");
5664 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);