/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
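/* md_new_event - note an md event: bump the global event count and
 * wake anyone waiting in poll/select on /proc/mdstat.
 */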
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}

/*
 * Allows iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning a reference
 * to the current mddev must mddev_put() it.
 */
#define for_each_mddev(mddev,tmp)                                       \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )
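/* Illustrative sketch of typical use; the macro does the locking and
 * refcounting itself, so the loop body just uses 'mddev':
 *
 *      mddev_t *mddev;
 *      struct list_head *tmp;
 *
 *      for_each_mddev(mddev, tmp)
 *              printk(KERN_INFO "md%d\n", mddev->md_minor);
 */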
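/* Used as the default make_request function for a freshly allocated
 * queue (see mddev_find): every bio fails until the array is started
 * and a personality installs a real request function.
 */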
static int md_fail_request (struct request_queue *q, struct bio *bio)
{
        bio_io_error(bio);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
                kobject_put(&mddev->kobj);
        } else
                spin_unlock(&all_mddevs_lock);
}

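/* Find the mddev for 'unit', creating it if it doesn't exist yet.
 * The allocation is done unlocked and the search retried, so when two
 * callers race the loser just frees its copy and takes a reference on
 * the winner's.
 */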
static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        init_waitqueue_head(&new->recovery_wait);
        new->reshape_position = MaxSector;
        new->resync_min = 0;
        new->resync_max = MaxSector;
        new->level = LEVEL_NONE;

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }
        /* Can be unlocked because the queue is new: no concurrency */
        queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}

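/* All reconfiguration of an array is serialised by reconfig_mutex;
 * dropping it also wakes the array's management thread so it can act
 * on whatever changed.
 */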
static inline int mddev_lock(mddev_t * mddev)
{
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        rdev_for_each(rdev, tmp, mddev) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        rdev_for_each(rdev, tmp, mddev) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

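/* Find a registered personality matching either the numeric level
 * (when not LEVEL_NONE) or the textual name in clevel.
 */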
static struct mdk_personality *find_pers(int level, char *clevel)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel)==0)
                        return pers;
        }
        return NULL;
}

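/* For 0.90 metadata the superblock sits near the end of the device.
 * These helpers return its offset and the usable data size, both in
 * 1K blocks.
 */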
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}

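/* Completion handler for superblock writes: report a failed write via
 * md_error(), then drop pending_writes and wake sb_wait when the last
 * outstanding write finishes.
 */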
static void super_written(struct bio *bio, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                printk("md: super_written gets error=%d, uptodate=%d\n",
                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
        }

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
}

static void super_written_barrier(struct bio *bio, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
        } else {
                bio_put(bio2);
                bio->bi_private = rdev;
                super_written(bio, error);
        }
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurs, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * causes -EOPNOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}

void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}

static void bi_complete(struct bio *bio, int error)
{
        complete((struct completion*)bio->bi_private);
}

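/* Synchronously read or write 'size' bytes of 'page' at 'sector' on
 * bdev, waiting on a private completion.  Returns 1 on success,
 * 0 on failure.
 */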
int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

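/* Read the superblock into rdev->sb_page; sb_loaded caches the result
 * so repeated calls are cheap.
 */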
static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {
                MD_BUG();
                return -EINVAL;
        }
        if (rdev->sb_loaded)
                return 0;


        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;

fail:
        printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
                bdevname(rdev->bdev,b));
        return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        return  sb1->set_uuid0 == sb2->set_uuid0 &&
                sb1->set_uuid1 == sb2->set_uuid1 &&
                sb1->set_uuid2 == sb2->set_uuid2 &&
                sb1->set_uuid3 == sb2->set_uuid3;
}

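/* Compare the generic, constant part of two 0.90 superblocks.
 * nr_disks is masked out first because it changes as devices come
 * and go.
 */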
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
        int ret;
        mdp_super_t *tmp1, *tmp2;

        tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
        tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

        if (!tmp1 || !tmp2) {
                ret = 0;
                printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
                goto abort;
        }

        *tmp1 = *sb1;
        *tmp2 = *sb2;

        /*
         * nr_disks is not constant
         */
        tmp1->nr_disks = 0;
        tmp2->nr_disks = 0;

        if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
                ret = 0;
        else
                ret = 1;

abort:
        kfree(tmp1);
        kfree(tmp2);
        return ret;
}

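/* Fold a 32-bit checksum down to 16 bits, carrying as in the IP
 * checksum fold.
 */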
static u32 md_csum_fold(u32 csum)
{
        csum = (csum & 0xffff) + (csum >> 16);
        return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
        u64 newcsum = 0;
        u32 *sb32 = (u32*)sb;
        int i;
        unsigned int disk_csum, csum;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;

        for (i = 0; i < MD_SB_BYTES/4 ; i++)
                newcsum += sb32[i];
        csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
        /* This used to use csum_partial, which was wrong for several
         * reasons including that different results are returned on
         * different architectures.  It isn't critical that we get exactly
         * the same return value as before (we always csum_fold before
         * testing, and that removes any differences).  However as we
         * know that csum_partial always returned a 16bit value on
         * alphas, do a fold to maximise conformity to previous behaviour.
         */
        sb->sb_csum = md_csum_fold(disk_csum);
#else
        sb->sb_csum = disk_csum;
#endif
        return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
        char                *name;
        struct module       *owner;
        int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
                                          int minor_version);
        int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
        unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
                                                unsigned long long size);
};

/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        mdp_super_t *sb;
        int ret;
        sector_t sb_offset;

        /*
         * Calculate the position of the superblock,
         * it's at the end of the disk.
         *
         * It also happens to be a multiple of 4Kb.
         */
        sb_offset = calc_dev_sboffset(rdev->bdev);
        rdev->sb_offset = sb_offset;

        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;

        ret = -EINVAL;

        bdevname(rdev->bdev, b);
        sb = (mdp_super_t*)page_address(rdev->sb_page);

        if (sb->md_magic != MD_SB_MAGIC) {
                printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
                       b);
                goto abort;
        }

        if (sb->major_version != 0 ||
            sb->minor_version < 90 ||
            sb->minor_version > 91) {
                printk(KERN_WARNING "Bad version number %d.%d on %s\n",
                        sb->major_version, sb->minor_version,
                        b);
                goto abort;
        }

        if (sb->raid_disks <= 0)
                goto abort;

        if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
                printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
                        b);
                goto abort;
        }

        rdev->preferred_minor = sb->md_minor;
        rdev->data_offset = 0;
        rdev->sb_size = MD_SB_BYTES;

        if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
                if (sb->level != 1 && sb->level != 4
                    && sb->level != 5 && sb->level != 6
                    && sb->level != 10) {
                        /* FIXME use a better test */
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        goto abort;
                }
        }

        if (sb->level == LEVEL_MULTIPATH)
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = sb->this_disk.number;

        if (!refdev) {
                ret = 1;
        } else {
                __u64 ev1, ev2;
                mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
                if (!uuid_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has different UUID to %s\n",
                                b, bdevname(refdev->bdev,b2));
                        goto abort;
                }
                if (!sb_equal(refsb, sb)) {
                        printk(KERN_WARNING "md: %s has same UUID"
                               " but different superblock to %s\n",
                               b, bdevname(refdev->bdev, b2));
                        goto abort;
                }
                ev1 = md_event(sb);
                ev2 = md_event(refsb);
                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        rdev->size = calc_dev_size(rdev, sb->chunk_size);

        if (rdev->size < sb->size && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;

 abort:
        return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
        __u64 ev1 = md_event(sb);

        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
        clear_bit(BarriersNotsupp, &rdev->flags);

        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
                mddev->external = 0;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
                mddev->level = sb->level;
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
                mddev->size = sb->size;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;

                if (mddev->minor_version >= 91) {
                        mddev->reshape_position = sb->reshape_position;
                        mddev->delta_disks = sb->delta_disks;
                        mddev->new_level = sb->new_level;
                        mddev->new_layout = sb->new_layout;
                        mddev->new_chunk = sb->new_chunk;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

                if (sb->state & (1<<MD_SB_CLEAN))
                        mddev->recovery_cp = MaxSector;
                else {
                        if (sb->events_hi == sb->cp_events_hi &&
                                sb->events_lo == sb->cp_events_lo) {
                                mddev->recovery_cp = sb->recovery_cp;
                        } else
                                mddev->recovery_cp = 0;
                }

                memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
                memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
                memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

                mddev->max_disks = MD_SB_DISKS;

                if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                    mddev->bitmap_file == NULL)
                        mddev->bitmap_offset = mddev->default_bitmap_offset;

        } else if (mddev->pers == NULL) {
                /* Insist on a good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* if adding to array with a bitmap, then we can accept an
                 * older device ... but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }

        if (mddev->level != LEVEL_MULTIPATH) {
                desc = sb->disks + rdev->desc_nr;

                if (desc->state & (1<<MD_DISK_FAULTY))
                        set_bit(Faulty, &rdev->flags);
                else if (desc->state & (1<<MD_DISK_SYNC) /* &&
                            desc->raid_disk < mddev->raid_disks */) {
                        set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = desc->raid_disk;
                }
                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
        return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        mdp_super_t *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;


        /* make rdev->sb match mddev data..
         *
         * 1/ zero out disks
         * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
         * 3/ any empty disks < next_spare become removed
         *
         * disks[0] gets initialised to REMOVED because
         * we cannot be sure from other fields if it has
         * been initialised or not.
         */
        int i;
        int active=0, working=0,failed=0,spare=0,nr_disks=0;

        rdev->sb_size = MD_SB_BYTES;

        sb = (mdp_super_t*)page_address(rdev->sb_page);

        memset(sb, 0, sizeof(*sb));

        sb->md_magic = MD_SB_MAGIC;
        sb->major_version = mddev->major_version;
        sb->patch_version = mddev->patch_version;
        sb->gvalid_words  = 0; /* ignored */
        memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
        memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
        memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
        memcpy(&sb->set_uuid3, mddev->uuid+12,4);

        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = 0;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
        sb->events_lo = (u32)mddev->events;

        if (mddev->reshape_position == MaxSector)
                sb->minor_version = 90;
        else {
                sb->minor_version = 91;
                sb->reshape_position = mddev->reshape_position;
                sb->new_level = mddev->new_level;
                sb->delta_disks = mddev->delta_disks;
                sb->new_layout = mddev->new_layout;
                sb->new_chunk = mddev->new_chunk;
        }
        mddev->minor_version = sb->minor_version;
        if (mddev->in_sync)
        {
                sb->recovery_cp = mddev->recovery_cp;
                sb->cp_events_hi = (mddev->events>>32);
                sb->cp_events_lo = (u32)mddev->events;
                if (mddev->recovery_cp == MaxSector)
                        sb->state = (1<< MD_SB_CLEAN);
        } else
                sb->recovery_cp = 0;

        sb->layout = mddev->layout;
        sb->chunk_size = mddev->chunk_size;

        if (mddev->bitmap && mddev->bitmap_file == NULL)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);

        sb->disks[0].state = (1<<MD_DISK_REMOVED);
        rdev_for_each(rdev2, tmp, mddev) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        desc_nr = rdev2->raid_disk;
                else
                        desc_nr = next_spare++;
                rdev2->desc_nr = desc_nr;
                d = &sb->disks[rdev2->desc_nr];
                nr_disks++;
                d->number = rdev2->desc_nr;
                d->major = MAJOR(rdev2->bdev->bd_dev);
                d->minor = MINOR(rdev2->bdev->bd_dev);
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
                    && !test_bit(Faulty, &rdev2->flags))
                        d->raid_disk = rdev2->raid_disk;
                else
                        d->raid_disk = rdev2->desc_nr; /* compatibility */
                if (test_bit(Faulty, &rdev2->flags))
                        d->state = (1<<MD_DISK_FAULTY);
                else if (test_bit(In_sync, &rdev2->flags)) {
                        d->state = (1<<MD_DISK_ACTIVE);
                        d->state |= (1<<MD_DISK_SYNC);
                        active++;
                        working++;
                } else {
                        d->state = 0;
                        spare++;
                        working++;
                }
                if (test_bit(WriteMostly, &rdev2->flags))
                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
        }
        /* now set the "removed" and "faulty" bits on any missing devices */
        for (i=0 ; i < mddev->raid_disks ; i++) {
                mdp_disk_t *d = &sb->disks[i];
                if (d->state == 0 && d->number == 0) {
                        d->number = i;
                        d->raid_disk = i;
                        d->state = (1<<MD_DISK_REMOVED);
                        d->state |= (1<<MD_DISK_FAULTY);
                        failed++;
                }
        }
        sb->nr_disks = nr_disks;
        sb->active_disks = active;
        sb->working_disks = working;
        sb->failed_disks = failed;
        sb->spare_disks = spare;

        sb->this_disk = sb->disks[rdev->desc_nr];
        sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
{
        if (size && size < rdev->mddev->size)
                return 0; /* component must fit device */
        size *= 2; /* convert to sectors */
        if (rdev->mddev->bitmap_offset)
                return 0; /* can't move bitmap */
        rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
        if (!size || size > rdev->sb_offset*2)
                size = rdev->sb_offset*2;
        md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
                       rdev->sb_page);
        md_super_wait(rdev->mddev);
        return size/2; /* kB for sysfs */
}


/*
 * version 1 superblock
 */

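/* Checksum for the version-1 superblock: a plain 32-bit sum over the
 * 256-byte header plus two bytes per device role, folded down from
 * 64 bits and returned little-endian.
 */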
static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
        __le32 disk_csum;
        u32 csum;
        unsigned long long newcsum;
        int size = 256 + le32_to_cpu(sb->max_dev)*2;
        __le32 *isuper = (__le32*)sb;
        int i;

        disk_csum = sb->sb_csum;
        sb->sb_csum = 0;
        newcsum = 0;
        for (i=0; size>=4; size -= 4 )
                newcsum += le32_to_cpu(*isuper++);

        if (size == 2)
                newcsum += le16_to_cpu(*(__le16*) isuper);

        csum = (newcsum & 0xffffffff) + (newcsum >> 32);
        sb->sb_csum = disk_csum;
        return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
        struct mdp_superblock_1 *sb;
        int ret;
        sector_t sb_offset;
        char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
        int bmask;

        /*
         * Calculate the position of the superblock.
         * It is always aligned to a 4K boundary and
         * depending on minor_version, it can be:
         * 0: At least 8K, but less than 12K, from end of device
         * 1: At start of device
         * 2: 4K from start of device.
         */
        switch(minor_version) {
        case 0:
                sb_offset = rdev->bdev->bd_inode->i_size >> 9;
                sb_offset -= 8*2;
                sb_offset &= ~(sector_t)(4*2-1);
                /* convert from sectors to K */
                sb_offset /= 2;
                break;
        case 1:
                sb_offset = 0;
                break;
        case 2:
                sb_offset = 4;
                break;
        default:
                return -EINVAL;
        }
        rdev->sb_offset = sb_offset;

        /* superblock is rarely larger than 1K, but it can be larger,
         * and it is safe to read 4k, so we do that
         */
        ret = read_disk_sb(rdev, 4096);
        if (ret) return ret;


        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
            sb->major_version != cpu_to_le32(1) ||
            le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
            le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                return -EINVAL;

        if (calc_sb_1_csum(sb) != sb->sb_csum) {
                printk("md: invalid superblock checksum on %s\n",
                        bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if (le64_to_cpu(sb->data_size) < 10) {
                printk("md: data_size too small on %s\n",
                       bdevname(rdev->bdev,b));
                return -EINVAL;
        }
        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
                if (sb->level != cpu_to_le32(1) &&
                    sb->level != cpu_to_le32(4) &&
                    sb->level != cpu_to_le32(5) &&
                    sb->level != cpu_to_le32(6) &&
                    sb->level != cpu_to_le32(10)) {
                        printk(KERN_WARNING
                               "md: bitmaps not supported for this level.\n");
                        return -EINVAL;
                }
        }

        rdev->preferred_minor = 0xffff;
        rdev->data_offset = le64_to_cpu(sb->data_offset);
        atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
        if (rdev->sb_size & bmask)
                rdev->sb_size = (rdev->sb_size | bmask) + 1;

        if (minor_version
            && rdev->data_offset < sb_offset + (rdev->sb_size/512))
                return -EINVAL;

        if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
                rdev->desc_nr = -1;
        else
                rdev->desc_nr = le32_to_cpu(sb->dev_number);

        if (!refdev) {
                ret = 1;
        } else {
                __u64 ev1, ev2;
                struct mdp_superblock_1 *refsb =
                        (struct mdp_superblock_1*)page_address(refdev->sb_page);

                if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
                    sb->level != refsb->level ||
                    sb->layout != refsb->layout ||
                    sb->chunksize != refsb->chunksize) {
                        printk(KERN_WARNING "md: %s has strangely different"
                                " superblock to %s\n",
                                bdevname(rdev->bdev,b),
                                bdevname(refdev->bdev,b2));
                        return -EINVAL;
                }
                ev1 = le64_to_cpu(sb->events);
                ev2 = le64_to_cpu(refsb->events);

                if (ev1 > ev2)
                        ret = 1;
                else
                        ret = 0;
        }
        if (minor_version)
                rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
        else
                rdev->size = rdev->sb_offset;
        if (rdev->size < le64_to_cpu(sb->data_size)/2)
                return -EINVAL;
        rdev->size = le64_to_cpu(sb->data_size)/2;
        if (le32_to_cpu(sb->chunksize))
                rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

        if (le64_to_cpu(sb->size) > rdev->size*2)
                return -EINVAL;
        return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
        __u64 ev1 = le64_to_cpu(sb->events);

        rdev->raid_disk = -1;
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
        clear_bit(BarriersNotsupp, &rdev->flags);

        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->external = 0;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
                mddev->level = le32_to_cpu(sb->level);
                mddev->clevel[0] = 0;
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
                mddev->size = le64_to_cpu(sb->size)/2;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;

                mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                memcpy(mddev->uuid, sb->set_uuid, 16);

                mddev->max_disks =  (4096-256)/2;

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                    mddev->bitmap_file == NULL )
                        mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
                        mddev->reshape_position = le64_to_cpu(sb->reshape_position);
                        mddev->delta_disks = le32_to_cpu(sb->delta_disks);
                        mddev->new_level = le32_to_cpu(sb->new_level);
                        mddev->new_layout = le32_to_cpu(sb->new_layout);
                        mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
                } else {
                        mddev->reshape_position = MaxSector;
                        mddev->delta_disks = 0;
                        mddev->new_level = mddev->level;
                        mddev->new_layout = mddev->layout;
                        mddev->new_chunk = mddev->chunk_size;
                }

        } else if (mddev->pers == NULL) {
                /* Insist on a good event counter while assembling */
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
        } else if (mddev->bitmap) {
                /* If adding to array with a bitmap, then we can accept an
                 * older device, but not too old.
                 */
                if (ev1 < mddev->bitmap->events_cleared)
                        return 0;
        } else {
                if (ev1 < mddev->events)
                        /* just a hot-add of a new device, leave raid_disk at -1 */
                        return 0;
        }
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
                        break;
                case 0xfffe: /* faulty */
                        set_bit(Faulty, &rdev->flags);
                        break;
                default:
                        if ((le32_to_cpu(sb->feature_map) &
                             MD_FEATURE_RECOVERY_OFFSET))
                                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
                        else
                                set_bit(In_sync, &rdev->flags);
                        rdev->raid_disk = role;
                        break;
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);

        return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
        struct mdp_superblock_1 *sb;
        struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */

        sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

        sb->feature_map = 0;
        sb->pad0 = 0;
        sb->recovery_offset = cpu_to_le64(0);
        memset(sb->pad1, 0, sizeof(sb->pad1));
        memset(sb->pad2, 0, sizeof(sb->pad2));
        memset(sb->pad3, 0, sizeof(sb->pad3));

        sb->utime = cpu_to_le64((__u64)mddev->utime);
        sb->events = cpu_to_le64(mddev->events);
        if (mddev->in_sync)
                sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
        else
                sb->resync_offset = cpu_to_le64(0);

        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
        sb->size = cpu_to_le64(mddev->size<<1);

        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
        }

        if (rdev->raid_disk >= 0 &&
            !test_bit(In_sync, &rdev->flags) &&
            rdev->recovery_offset > 0) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
                sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
        }

        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
                sb->reshape_position = cpu_to_le64(mddev->reshape_position);
                sb->new_layout = cpu_to_le32(mddev->new_layout);
                sb->delta_disks = cpu_to_le32(mddev->delta_disks);
                sb->new_level = cpu_to_le32(mddev->new_level);
                sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
        }

        max_dev = 0;
        rdev_for_each(rdev2, tmp, mddev)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;

        if (max_dev > le32_to_cpu(sb->max_dev))
                sb->max_dev = cpu_to_le32(max_dev);
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);

        rdev_for_each(rdev2, tmp, mddev) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
                else if (test_bit(In_sync, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
                        sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
                else
                        sb->dev_roles[i] = cpu_to_le16(0xffff);
        }

        sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
{
        struct mdp_superblock_1 *sb;
        unsigned long long max_size;
        if (size && size < rdev->mddev->size)
                return 0; /* component must fit device */
        size *= 2; /* convert to sectors */
        if (rdev->sb_offset < rdev->data_offset/2) {
                /* minor versions 1 and 2; superblock before data */
                max_size = (rdev->bdev->bd_inode->i_size >> 9);
                max_size -= rdev->data_offset;
                if (!size || size > max_size)
                        size = max_size;
        } else if (rdev->mddev->bitmap_offset) {
                /* minor version 0 with bitmap we can't move */
                return 0;
        } else {
                /* minor version 0; superblock after data */
                sector_t sb_offset;
                sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
                sb_offset &= ~(sector_t)(4*2 - 1);
                max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
                if (!size || size > max_size)
                        size = max_size;
                rdev->sb_offset = sb_offset/2;
        }
        sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
        sb->data_size = cpu_to_le64(size);
        sb->super_offset = rdev->sb_offset*2;
        sb->sb_csum = calc_sb_1_csum(sb);
        md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
                       rdev->sb_page);
        md_super_wait(rdev->mddev);
        return size/2; /* kB for sysfs */
}

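/* Table of known on-disk metadata formats, indexed by major version. */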
static struct super_type super_types[] = {
        [0] = {
                .name   = "0.90.0",
                .owner  = THIS_MODULE,
                .load_super         = super_90_load,
                .validate_super     = super_90_validate,
                .sync_super         = super_90_sync,
                .rdev_size_change   = super_90_rdev_size_change,
        },
        [1] = {
                .name   = "md-1",
                .owner  = THIS_MODULE,
                .load_super         = super_1_load,
                .validate_super     = super_1_validate,
                .sync_super         = super_1_sync,
                .rdev_size_change   = super_1_rdev_size_change,
        },
};

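/* Two arrays "match" if any member of one lives on the same underlying
 * whole disk (bd_contains) as a member of the other.
 */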
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
        struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev, *rdev2;

        rdev_for_each(rdev, tmp, mddev1)
                rdev_for_each(rdev2, tmp2, mddev2)
                        if (rdev->bdev->bd_contains ==
                            rdev2->bdev->bd_contains)
                                return 1;

        return 0;
}

static LIST_HEAD(pending_raid_disks);

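/* Attach rdev to mddev: check that sizes are compatible, assign a
 * unique desc_nr, register the kobject under the array, and claim the
 * block device.
 */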
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
        char b[BDEVNAME_SIZE];
        struct kobject *ko;
        char *s;
        int err;

        if (rdev->mddev) {
                MD_BUG();
                return -EINVAL;
        }

        /* prevent duplicates */
        if (find_rdev(mddev, rdev->bdev->bd_dev))
                return -EEXIST;

        /* make sure rdev->size exceeds mddev->size */
        if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
                if (mddev->pers) {
                        /* Cannot change size, so fail
                         * If mddev->level <= 0, then we don't care
                         * about aligning sizes (e.g. linear)
                         */
                        if (mddev->level > 0)
                                return -ENOSPC;
                } else
                        mddev->size = rdev->size;
        }

        /* Verify rdev->desc_nr is unique.
         * If it is -1, assign a free number, else
         * check number is not in use
         */
        if (rdev->desc_nr < 0) {
                int choice = 0;
                if (mddev->pers) choice = mddev->raid_disks;
                while (find_rdev_nr(mddev, choice))
                        choice++;
                rdev->desc_nr = choice;
        } else {
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
        while ( (s=strchr(b, '/')) != NULL)
                *s = '!';

        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);

        if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
                goto fail;

        if (rdev->bdev->bd_part)
                ko = &rdev->bdev->bd_part->dev.kobj;
        else
                ko = &rdev->bdev->bd_disk->dev.kobj;
        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
                kobject_del(&rdev->kobj);
                goto fail;
        }
        list_add(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
        return 0;

 fail:
        printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
               b, mdname(mddev));
        return err;
}
1492
1493 static void md_delayed_delete(struct work_struct *ws)
1494 {
1495         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1496         kobject_del(&rdev->kobj);
1497         kobject_put(&rdev->kobj);
1498 }
1499
1500 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1501 {
1502         char b[BDEVNAME_SIZE];
1503         if (!rdev->mddev) {
1504                 MD_BUG();
1505                 return;
1506         }
1507         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1508         list_del_init(&rdev->same_set);
1509         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1510         rdev->mddev = NULL;
1511         sysfs_remove_link(&rdev->kobj, "block");
1512
1513         /* We need to delay this, otherwise we can deadlock when
1514          * writing 'remove' to "dev/state"
1515          */
1516         INIT_WORK(&rdev->del_work, md_delayed_delete);
1517         kobject_get(&rdev->kobj);
1518         schedule_work(&rdev->del_work);
1519 }
1520
1521 /*
1522  * prevent the device from being mounted, repartitioned or
1523  * otherwise reused by a RAID array (or any other kernel
1524  * subsystem), by bd_claiming the device.
1525  */
1526 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1527 {
1528         int err = 0;
1529         struct block_device *bdev;
1530         char b[BDEVNAME_SIZE];
1531
1532         bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1533         if (IS_ERR(bdev)) {
1534                 printk(KERN_ERR "md: could not open %s.\n",
1535                         __bdevname(dev, b));
1536                 return PTR_ERR(bdev);
1537         }
1538         err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1539         if (err) {
1540                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1541                         bdevname(bdev, b));
1542                 blkdev_put(bdev);
1543                 return err;
1544         }
1545         if (!shared)
1546                 set_bit(AllReserved, &rdev->flags);
1547         rdev->bdev = bdev;
1548         return err;
1549 }
1550
1551 static void unlock_rdev(mdk_rdev_t *rdev)
1552 {
1553         struct block_device *bdev = rdev->bdev;
1554         rdev->bdev = NULL;
1555         if (!bdev)
1556                 MD_BUG();
1557         bd_release(bdev);
1558         blkdev_put(bdev);
1559 }
1560
1561 void md_autodetect_dev(dev_t dev);
1562
1563 static void export_rdev(mdk_rdev_t * rdev)
1564 {
1565         char b[BDEVNAME_SIZE];
1566         printk(KERN_INFO "md: export_rdev(%s)\n",
1567                 bdevname(rdev->bdev,b));
1568         if (rdev->mddev)
1569                 MD_BUG();
1570         free_disk_sb(rdev);
1571         list_del_init(&rdev->same_set);
1572 #ifndef MODULE
1573         if (test_bit(AutoDetected, &rdev->flags))
1574                 md_autodetect_dev(rdev->bdev->bd_dev);
1575 #endif
1576         unlock_rdev(rdev);
1577         kobject_put(&rdev->kobj);
1578 }
1579
1580 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1581 {
1582         unbind_rdev_from_array(rdev);
1583         export_rdev(rdev);
1584 }
1585
1586 static void export_array(mddev_t *mddev)
1587 {
1588         struct list_head *tmp;
1589         mdk_rdev_t *rdev;
1590
1591         rdev_for_each(rdev, tmp, mddev) {
1592                 if (!rdev->mddev) {
1593                         MD_BUG();
1594                         continue;
1595                 }
1596                 kick_rdev_from_array(rdev);
1597         }
1598         if (!list_empty(&mddev->disks))
1599                 MD_BUG();
1600         mddev->raid_disks = 0;
1601         mddev->major_version = 0;
1602 }
1603
1604 static void print_desc(mdp_disk_t *desc)
1605 {
1606         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1607                 desc->major,desc->minor,desc->raid_disk,desc->state);
1608 }
1609
1610 static void print_sb(mdp_super_t *sb)
1611 {
1612         int i;
1613
1614         printk(KERN_INFO 
1615                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1616                 sb->major_version, sb->minor_version, sb->patch_version,
1617                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1618                 sb->ctime);
1619         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1620                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1621                 sb->md_minor, sb->layout, sb->chunk_size);
1622         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1623                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1624                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1625                 sb->failed_disks, sb->spare_disks,
1626                 sb->sb_csum, (unsigned long)sb->events_lo);
1627
1628         printk(KERN_INFO);
1629         for (i = 0; i < MD_SB_DISKS; i++) {
1630                 mdp_disk_t *desc;
1631
1632                 desc = sb->disks + i;
1633                 if (desc->number || desc->major || desc->minor ||
1634                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1635                         printk("     D %2d: ", i);
1636                         print_desc(desc);
1637                 }
1638         }
1639         printk(KERN_INFO "md:     THIS: ");
1640         print_desc(&sb->this_disk);
1641
1642 }
1643
1644 static void print_rdev(mdk_rdev_t *rdev)
1645 {
1646         char b[BDEVNAME_SIZE];
1647         printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1648                 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1649                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1650                 rdev->desc_nr);
1651         if (rdev->sb_loaded) {
1652                 printk(KERN_INFO "md: rdev superblock:\n");
1653                 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1654         } else
1655                 printk(KERN_INFO "md: no rdev superblock!\n");
1656 }
1657
1658 static void md_print_devices(void)
1659 {
1660         struct list_head *tmp, *tmp2;
1661         mdk_rdev_t *rdev;
1662         mddev_t *mddev;
1663         char b[BDEVNAME_SIZE];
1664
1665         printk("\n");
1666         printk("md:     **********************************\n");
1667         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1668         printk("md:     **********************************\n");
1669         for_each_mddev(mddev, tmp) {
1670
1671                 if (mddev->bitmap)
1672                         bitmap_print_sb(mddev->bitmap);
1673                 else
1674                         printk("%s: ", mdname(mddev));
1675                 rdev_for_each(rdev, tmp2, mddev)
1676                         printk("<%s>", bdevname(rdev->bdev,b));
1677                 printk("\n");
1678
1679                 rdev_for_each(rdev, tmp2, mddev)
1680                         print_rdev(rdev);
1681         }
1682         printk("md:     **********************************\n");
1683         printk("\n");
1684 }
1685
1686
1687 static void sync_sbs(mddev_t * mddev, int nospares)
1688 {
1689         /* Update each superblock (in-memory image), but
1690          * if we are allowed to, skip spares which already
1691          * have the right event counter, or have one earlier
1692          * (which would mean they aren't being marked as dirty
1693          * with the rest of the array)
1694          */
1695         mdk_rdev_t *rdev;
1696         struct list_head *tmp;
1697
1698         rdev_for_each(rdev, tmp, mddev) {
1699                 if (rdev->sb_events == mddev->events ||
1700                     (nospares &&
1701                      rdev->raid_disk < 0 &&
1702                      (rdev->sb_events&1)==0 &&
1703                      rdev->sb_events+1 == mddev->events)) {
1704                         /* Don't update this superblock */
1705                         rdev->sb_loaded = 2;
1706                 } else {
1707                         super_types[mddev->major_version].
1708                                 sync_super(mddev, rdev);
1709                         rdev->sb_loaded = 1;
1710                 }
1711         }
1712 }
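
/*
 * After sync_sbs(), sb_loaded == 1 flags superblocks that must hit the
 * disk, while sb_loaded == 2 marks spares whose on-disk copy is already
 * good enough; md_update_sb() below silently skips the latter.
 */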
1713
1714 static void md_update_sb(mddev_t * mddev, int force_change)
1715 {
1716         struct list_head *tmp;
1717         mdk_rdev_t *rdev;
1718         int sync_req;
1719         int nospares = 0;
1720
1721         if (mddev->external)
1722                 return;
1723 repeat:
1724         spin_lock_irq(&mddev->write_lock);
1725
1726         set_bit(MD_CHANGE_PENDING, &mddev->flags);
1727         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1728                 force_change = 1;
1729         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1730                 /* just a clean <-> dirty transition; possibly leave the
1731                  * spares alone, though if 'events' doesn't end up with the
1732                  * right even/odd parity we will have to update the spares
1733                  * after all
1734                  */
1734                 nospares = 1;
1735         if (force_change)
1736                 nospares = 0;
1737         if (mddev->degraded)
1738                 /* If the array is degraded, then skipping spares is both
1739                  * dangerous and fairly pointless.
1740                  * Dangerous because a device that was removed from the array
1741          * might have an event_count that still looks up-to-date,
1742                  * so it can be re-added without a resync.
1743                  * Pointless because if there are any spares to skip,
1744                  * then a recovery will happen and soon that array won't
1745                  * be degraded any more and the spare can go back to sleep then.
1746                  */
1747                 nospares = 0;
1748
1749         sync_req = mddev->in_sync;
1750         mddev->utime = get_seconds();
1751
1752         /* If this is just a dirty<->clean transition, and the array is clean
1753          * and 'events' is odd, we can roll back to the previous clean state */
1754         if (nospares
1755             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1756             && (mddev->events & 1)
1757             && mddev->events != 1)
1758                 mddev->events--;
1759         else {
1760                 /* otherwise we have to go forward and ... */
1761                 mddev->events ++;
1762                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1763                         /* .. if the array isn't clean, insist on an odd 'events' */
1764                         if ((mddev->events&1)==0) {
1765                                 mddev->events++;
1766                                 nospares = 0;
1767                         }
1768                 } else {
1769                         /* otherwise insist on an even 'events' (for clean states) */
1770                         if ((mddev->events&1)) {
1771                                 mddev->events++;
1772                                 nospares = 0;
1773                         }
1774                 }
1775         }
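        /* Worked example of the parity convention (illustrative numbers):
         * a clean array sits at an even count, say events == 42.  A write
         * marks it dirty and bumps it to the odd 43; when it goes clean
         * again we can roll straight back to 42 instead of rewriting the
         * spares, which still record 42 - the nospares case handled above.
         */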
1776
1777         if (!mddev->events) {
1778                 /*
1779                  * oops, this 64-bit counter should never wrap.
1780                  * Either we are in around ~1 trillion A.C., assuming
1781                  * 1 reboot per second, or we have a bug:
1782                  */
1783                 MD_BUG();
1784                 mddev->events --;
1785         }
1786
1787         /*
1788          * do not write anything to disk if using
1789          * nonpersistent superblocks
1790          */
1791         if (!mddev->persistent) {
1792                 if (!mddev->external)
1793                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1794
1795                 spin_unlock_irq(&mddev->write_lock);
1796                 wake_up(&mddev->sb_wait);
1797                 return;
1798         }
1799         sync_sbs(mddev, nospares);
1800         spin_unlock_irq(&mddev->write_lock);
1801
1802         dprintk(KERN_INFO 
1803                 "md: updating %s RAID superblock on device (in sync %d)\n",
1804                 mdname(mddev),mddev->in_sync);
1805
1806         bitmap_update_sb(mddev->bitmap);
1807         rdev_for_each(rdev, tmp, mddev) {
1808                 char b[BDEVNAME_SIZE];
1809                 dprintk(KERN_INFO "md: ");
1810                 if (rdev->sb_loaded != 1)
1811                         continue; /* no noise on spare devices */
1812                 if (test_bit(Faulty, &rdev->flags))
1813                         dprintk("(skipping faulty ");
1814
1815                 dprintk("%s ", bdevname(rdev->bdev,b));
1816                 if (!test_bit(Faulty, &rdev->flags)) {
1817                         md_super_write(mddev,rdev,
1818                                        rdev->sb_offset<<1, rdev->sb_size,
1819                                        rdev->sb_page);
1820                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1821                                 bdevname(rdev->bdev,b),
1822                                 (unsigned long long)rdev->sb_offset);
1823                         rdev->sb_events = mddev->events;
1824
1825                 } else
1826                         dprintk(")\n");
1827                 if (mddev->level == LEVEL_MULTIPATH)
1828                         /* only need to write one superblock... */
1829                         break;
1830         }
1831         md_super_wait(mddev);
1832         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
1833
1834         spin_lock_irq(&mddev->write_lock);
1835         if (mddev->in_sync != sync_req ||
1836             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
1837                 /* have to write it out again */
1838                 spin_unlock_irq(&mddev->write_lock);
1839                 goto repeat;
1840         }
1841         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
1842         spin_unlock_irq(&mddev->write_lock);
1843         wake_up(&mddev->sb_wait);
1844
1845 }
1846
1847 /* words written to sysfs files may, or may not, be \n terminated.
1848  * We want to accept either case. For this we use cmd_match.
1849  */
1850 static int cmd_match(const char *cmd, const char *str)
1851 {
1852         /* See if cmd, written into a sysfs file, matches
1853          * str.  They must either be the same, or cmd can
1854          * have a trailing newline
1855          */
1856         while (*cmd && *str && *cmd == *str) {
1857                 cmd++;
1858                 str++;
1859         }
1860         if (*cmd == '\n')
1861                 cmd++;
1862         if (*str || *cmd)
1863                 return 0;
1864         return 1;
1865 }
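
/*
 * A few cmd_match() cases for reference (a sketch, never built: the
 * results are shown in the trailing comments):
 */
#if 0
static void cmd_match_examples(void)
{
        int r;
        r = cmd_match("faulty\n", "faulty");    /* 1: one trailing '\n' is OK */
        r = cmd_match("faulty",   "faulty");    /* 1: exact match             */
        r = cmd_match("fault",    "faulty");    /* 0: 'cmd' is only a prefix  */
        r = cmd_match("faulty!",  "faulty");    /* 0: junk other than '\n'    */
        (void)r;
}
#endif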
1866
1867 struct rdev_sysfs_entry {
1868         struct attribute attr;
1869         ssize_t (*show)(mdk_rdev_t *, char *);
1870         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
1871 };
1872
1873 static ssize_t
1874 state_show(mdk_rdev_t *rdev, char *page)
1875 {
1876         char *sep = "";
1877         size_t len = 0;
1878
1879         if (test_bit(Faulty, &rdev->flags)) {
1880                 len+= sprintf(page+len, "%sfaulty",sep);
1881                 sep = ",";
1882         }
1883         if (test_bit(In_sync, &rdev->flags)) {
1884                 len += sprintf(page+len, "%sin_sync",sep);
1885                 sep = ",";
1886         }
1887         if (test_bit(WriteMostly, &rdev->flags)) {
1888                 len += sprintf(page+len, "%swrite_mostly",sep);
1889                 sep = ",";
1890         }
1891         if (test_bit(Blocked, &rdev->flags)) {
1892                 len += sprintf(page+len, "%sblocked", sep);
1893                 sep = ",";
1894         }
1895         if (!test_bit(Faulty, &rdev->flags) &&
1896             !test_bit(In_sync, &rdev->flags)) {
1897                 len += sprintf(page+len, "%sspare", sep);
1898                 sep = ",";
1899         }
1900         return len+sprintf(page+len, "\n");
1901 }
1902
1903 static ssize_t
1904 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1905 {
1906         /* can write
1907          *  faulty  - simulates an error
1908          *  remove  - disconnects the device
1909          *  writemostly - sets write_mostly
1910          *  -writemostly - clears write_mostly
1911          *  blocked - sets the Blocked flag
1912          *  -blocked - clears the Blocked flag
1913          */
1914         int err = -EINVAL;
1915         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
1916                 md_error(rdev->mddev, rdev);
1917                 err = 0;
1918         } else if (cmd_match(buf, "remove")) {
1919                 if (rdev->raid_disk >= 0)
1920                         err = -EBUSY;
1921                 else {
1922                         mddev_t *mddev = rdev->mddev;
1923                         kick_rdev_from_array(rdev);
1924                         if (mddev->pers)
1925                                 md_update_sb(mddev, 1);
1926                         md_new_event(mddev);
1927                         err = 0;
1928                 }
1929         } else if (cmd_match(buf, "writemostly")) {
1930                 set_bit(WriteMostly, &rdev->flags);
1931                 err = 0;
1932         } else if (cmd_match(buf, "-writemostly")) {
1933                 clear_bit(WriteMostly, &rdev->flags);
1934                 err = 0;
1935         } else if (cmd_match(buf, "blocked")) {
1936                 set_bit(Blocked, &rdev->flags);
1937                 err = 0;
1938         } else if (cmd_match(buf, "-blocked")) {
1939                 clear_bit(Blocked, &rdev->flags);
1940                 wake_up(&rdev->blocked_wait);
1941                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1942                 md_wakeup_thread(rdev->mddev->thread);
1943
1944                 err = 0;
1945         }
1946         if (!err)
1947                 sysfs_notify(&rdev->kobj, NULL, "state");
1948         return err ? err : len;
1949 }
1950 static struct rdev_sysfs_entry rdev_state =
1951 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
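
/*
 * Example of driving state_store() from userspace (a sketch; the md
 * array and component names are assumptions):
 *
 *   echo faulty > /sys/block/md0/md/dev-sdb1/state
 *   echo remove > /sys/block/md0/md/dev-sdb1/state
 *
 * "remove" only succeeds once the device no longer occupies a raid
 * slot.  A read of the same file goes through state_show() and returns
 * a comma-separated list such as "faulty,write_mostly".
 */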
1952
1953 static ssize_t
1954 errors_show(mdk_rdev_t *rdev, char *page)
1955 {
1956         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1957 }
1958
1959 static ssize_t
1960 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1961 {
1962         char *e;
1963         unsigned long n = simple_strtoul(buf, &e, 10);
1964         if (*buf && (*e == 0 || *e == '\n')) {
1965                 atomic_set(&rdev->corrected_errors, n);
1966                 return len;
1967         }
1968         return -EINVAL;
1969 }
1970 static struct rdev_sysfs_entry rdev_errors =
1971 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
1972
1973 static ssize_t
1974 slot_show(mdk_rdev_t *rdev, char *page)
1975 {
1976         if (rdev->raid_disk < 0)
1977                 return sprintf(page, "none\n");
1978         else
1979                 return sprintf(page, "%d\n", rdev->raid_disk);
1980 }
1981
1982 static ssize_t
1983 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1984 {
1985         char *e;
1986         int err;
1987         char nm[20];
1988         int slot = simple_strtoul(buf, &e, 10);
1989         if (strncmp(buf, "none", 4)==0)
1990                 slot = -1;
1991         else if (e==buf || (*e && *e!= '\n'))
1992                 return -EINVAL;
1993         if (rdev->mddev->pers && slot == -1) {
1994                 /* Setting 'slot' on an active array requires also
1995                  * updating the 'rd%d' link, and communicating
1996                  * with the personality with ->hot_*_disk.
1997                  * For now we only support removing
1998                  * failed/spare devices.  This normally happens automatically,
1999                  * but not when the metadata is externally managed.
2000                  */
2001                 if (rdev->raid_disk == -1)
2002                         return -EEXIST;
2003                 /* personality does all needed checks */
2004                 if (rdev->mddev->pers->hot_remove_disk == NULL)
2005                         return -EINVAL;
2006                 err = rdev->mddev->pers->
2007                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2008                 if (err)
2009                         return err;
2010                 sprintf(nm, "rd%d", rdev->raid_disk);
2011                 sysfs_remove_link(&rdev->mddev->kobj, nm);
2012                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2013                 md_wakeup_thread(rdev->mddev->thread);
2014         } else if (rdev->mddev->pers) {
2015                 mdk_rdev_t *rdev2;
2016                 struct list_head *tmp;
2017                 /* Activating a spare .. or possibly reactivating
2018                  * if we ever get bitmaps working here.
2019                  */
2020
2021                 if (rdev->raid_disk != -1)
2022                         return -EBUSY;
2023
2024                 if (rdev->mddev->pers->hot_add_disk == NULL)
2025                         return -EINVAL;
2026
2027                 rdev_for_each(rdev2, tmp, rdev->mddev)
2028                         if (rdev2->raid_disk == slot)
2029                                 return -EEXIST;
2030
2031                 rdev->raid_disk = slot;
2032                 if (test_bit(In_sync, &rdev->flags))
2033                         rdev->saved_raid_disk = slot;
2034                 else
2035                         rdev->saved_raid_disk = -1;
2036                 err = rdev->mddev->pers->
2037                         hot_add_disk(rdev->mddev, rdev);
2038                 if (err) {
2039                         rdev->raid_disk = -1;
2040                         return err;
2041                 } else
2042                         sysfs_notify(&rdev->kobj, NULL, "state");
2043                 sprintf(nm, "rd%d", rdev->raid_disk);
2044                 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2045                         printk(KERN_WARNING
2046                                "md: cannot register "
2047                                "%s for %s\n",
2048                                nm, mdname(rdev->mddev));
2049
2050                 /* don't wakeup anyone, leave that to userspace. */
2051         } else {
2052                 if (slot >= rdev->mddev->raid_disks)
2053                         return -ENOSPC;
2054                 rdev->raid_disk = slot;
2055                 /* assume it is working */
2056                 clear_bit(Faulty, &rdev->flags);
2057                 clear_bit(WriteMostly, &rdev->flags);
2058                 set_bit(In_sync, &rdev->flags);
2059                 sysfs_notify(&rdev->kobj, NULL, "state");
2060         }
2061         return len;
2062 }
2063
2064
2065 static struct rdev_sysfs_entry rdev_slot =
2066 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
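
/*
 * Example slot manipulation via sysfs (a sketch; the device names are
 * assumptions).  On an active array, writing "none" detaches a
 * failed/spare component, while a number asks the personality to
 * hot-add the device into that slot:
 *
 *   echo none > /sys/block/md0/md/dev-sdc1/slot
 *   echo 2    > /sys/block/md0/md/dev-sdc1/slot
 *
 * On an inactive array the write simply records the slot assignment.
 */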
2067
2068 static ssize_t
2069 offset_show(mdk_rdev_t *rdev, char *page)
2070 {
2071         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2072 }
2073
2074 static ssize_t
2075 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2076 {
2077         char *e;
2078         unsigned long long offset = simple_strtoull(buf, &e, 10);
2079         if (e==buf || (*e && *e != '\n'))
2080                 return -EINVAL;
2081         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2082                 return -EBUSY;
2083         if (rdev->size && rdev->mddev->external)
2084                 /* Must set offset before size, so overlap checks
2085                  * can be sane */
2086                 return -EBUSY;
2087         rdev->data_offset = offset;
2088         return len;
2089 }
2090
2091 static struct rdev_sysfs_entry rdev_offset =
2092 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2093
2094 static ssize_t
2095 rdev_size_show(mdk_rdev_t *rdev, char *page)
2096 {
2097         return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
2098 }
2099
2100 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2101 {
2102         /* check if two start/length pairs overlap */
2103         if (s1+l1 <= s2)
2104                 return 0;
2105         if (s2+l2 <= s1)
2106                 return 0;
2107         return 1;
2108 }
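
/*
 * overlaps() treats each pair as a half-open extent [s, s+l).  Two
 * illustrative cases (a sketch, never built):
 */
#if 0
static void overlaps_examples(void)
{
        int r;
        r = overlaps(0, 100, 100, 50);  /* 0: [0,100) and [100,150) just touch */
        r = overlaps(0, 100,  99, 50);  /* 1: sector 99 falls in both extents  */
        (void)r;
}
#endif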
2109
2110 static ssize_t
2111 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2112 {
2113         char *e;
2114         unsigned long long size = simple_strtoull(buf, &e, 10);
2115         unsigned long long oldsize = rdev->size;
2116         mddev_t *my_mddev = rdev->mddev;
2117
2118         if (e==buf || (*e && *e != '\n'))
2119                 return -EINVAL;
2120         if (my_mddev->pers && rdev->raid_disk >= 0) {
2121                 if (rdev->mddev->persistent) {
2122                         size = super_types[rdev->mddev->major_version].
2123                                 rdev_size_change(rdev, size);
2124                         if (!size)
2125                                 return -EBUSY;
2126                 } else if (!size) {
2127                         size = (rdev->bdev->bd_inode->i_size >> 10);
2128                         size -= rdev->data_offset/2;
2129                 }
2130                 if (size < rdev->mddev->size)
2131                         return -EINVAL; /* component must fit device */
2132         }
2133
2134         rdev->size = size;
2135         if (size > oldsize && rdev->mddev->external) {
2136                 /* need to check that all other rdevs with the same ->bdev
2137                  * do not overlap.  We need to unlock the mddev to avoid
2138                  * a deadlock.  We have already changed rdev->size, and if
2139                  * we have to change it back, we will have the lock again.
2140                  */
2141                 mddev_t *mddev;
2142                 int overlap = 0;
2143                 struct list_head *tmp, *tmp2;
2144
2145                 mddev_unlock(my_mddev);
2146                 for_each_mddev(mddev, tmp) {
2147                         mdk_rdev_t *rdev2;
2148
2149                         mddev_lock(mddev);
2150                         rdev_for_each(rdev2, tmp2, mddev)
2151                                 if (test_bit(AllReserved, &rdev2->flags) ||
2152                                     (rdev->bdev == rdev2->bdev &&
2153                                      rdev != rdev2 &&
2154                                      overlaps(rdev->data_offset, rdev->size,
2155                                             rdev2->data_offset, rdev2->size))) {
2156                                         overlap = 1;
2157                                         break;
2158                                 }
2159                         mddev_unlock(mddev);
2160                         if (overlap) {
2161                                 mddev_put(mddev);
2162                                 break;
2163                         }
2164                 }
2165                 mddev_lock(my_mddev);
2166                 if (overlap) {
2167                         /* Someone else could have slipped in a size
2168                          * change here, but doing so is just silly.
2169                          * We put oldsize back because we *know* it is
2170                          * safe, and trust userspace not to race with
2171                          * itself
2172                          */
2173                         rdev->size = oldsize;
2174                         return -EBUSY;
2175                 }
2176         }
2177         if (size < my_mddev->size || my_mddev->size == 0)
2178                 my_mddev->size = size;
2179         return len;
2180 }
2181
2182 static struct rdev_sysfs_entry rdev_size =
2183 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2184
2185 static struct attribute *rdev_default_attrs[] = {
2186         &rdev_state.attr,
2187         &rdev_errors.attr,
2188         &rdev_slot.attr,
2189         &rdev_offset.attr,
2190         &rdev_size.attr,
2191         NULL,
2192 };
2193 static ssize_t
2194 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2195 {
2196         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2197         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2198         mddev_t *mddev = rdev->mddev;
2199         ssize_t rv;
2200
2201         if (!entry->show)
2202                 return -EIO;
2203
2204         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2205         if (!rv) {
2206                 if (rdev->mddev == NULL)
2207                         rv = -EBUSY;
2208                 else
2209                         rv = entry->show(rdev, page);
2210                 mddev_unlock(mddev);
2211         }
2212         return rv;
2213 }
2214
2215 static ssize_t
2216 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2217               const char *page, size_t length)
2218 {
2219         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2220         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2221         ssize_t rv;
2222         mddev_t *mddev = rdev->mddev;
2223
2224         if (!entry->store)
2225                 return -EIO;
2226         if (!capable(CAP_SYS_ADMIN))
2227                 return -EACCES;
2228         rv = mddev ? mddev_lock(mddev): -EBUSY;
2229         if (!rv) {
2230                 if (rdev->mddev == NULL)
2231                         rv = -EBUSY;
2232                 else
2233                         rv = entry->store(rdev, page, length);
2234                 mddev_unlock(mddev);
2235         }
2236         return rv;
2237 }
2238
2239 static void rdev_free(struct kobject *ko)
2240 {
2241         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2242         kfree(rdev);
2243 }
2244 static struct sysfs_ops rdev_sysfs_ops = {
2245         .show           = rdev_attr_show,
2246         .store          = rdev_attr_store,
2247 };
2248 static struct kobj_type rdev_ktype = {
2249         .release        = rdev_free,
2250         .sysfs_ops      = &rdev_sysfs_ops,
2251         .default_attrs  = rdev_default_attrs,
2252 };
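
/*
 * Wiring summary (added for clarity): a read of
 * /sys/block/mdX/md/dev-YYY/<attr> enters sysfs, which calls
 * rdev_sysfs_ops.show == rdev_attr_show(); that locks the owning mddev
 * and dispatches to the entry's ->show() (state_show(), errors_show(),
 * ...).  Writes take the symmetric path through rdev_attr_store().
 * rdev_free() runs when the last kobject reference is dropped, e.g.
 * from md_delayed_delete().
 */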
2253
2254 /*
2255  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2256  *
2257  * mark the device faulty if:
2258  *
2259  *   - the device is nonexistent (zero size)
2260  *   - the device has no valid superblock
2261  *
2262  * a faulty rdev _never_ has rdev->sb set.
2263  */
2264 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2265 {
2266         char b[BDEVNAME_SIZE];
2267         int err;
2268         mdk_rdev_t *rdev;
2269         sector_t size;
2270
2271         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2272         if (!rdev) {
2273                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2274                 return ERR_PTR(-ENOMEM);
2275         }
2276
2277         if ((err = alloc_disk_sb(rdev)))
2278                 goto abort_free;
2279
2280         err = lock_rdev(rdev, newdev, super_format == -2);
2281         if (err)
2282                 goto abort_free;
2283
2284         kobject_init(&rdev->kobj, &rdev_ktype);
2285
2286         rdev->desc_nr = -1;
2287         rdev->saved_raid_disk = -1;
2288         rdev->raid_disk = -1;
2289         rdev->flags = 0;
2290         rdev->data_offset = 0;
2291         rdev->sb_events = 0;
2292         atomic_set(&rdev->nr_pending, 0);
2293         atomic_set(&rdev->read_errors, 0);
2294         atomic_set(&rdev->corrected_errors, 0);
2295
2296         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2297         if (!size) {
2298                 printk(KERN_WARNING 
2299                         "md: %s has zero or unknown size, marking faulty!\n",
2300                         bdevname(rdev->bdev,b));
2301                 err = -EINVAL;
2302                 goto abort_free;
2303         }
2304
2305         if (super_format >= 0) {
2306                 err = super_types[super_format].
2307                         load_super(rdev, NULL, super_minor);
2308                 if (err == -EINVAL) {
2309                         printk(KERN_WARNING
2310                                 "md: %s does not have a valid v%d.%d "
2311                                "superblock, not importing!\n",
2312                                 bdevname(rdev->bdev,b),
2313                                super_format, super_minor);
2314                         goto abort_free;
2315                 }
2316                 if (err < 0) {
2317                         printk(KERN_WARNING 
2318                                 "md: could not read %s's sb, not importing!\n",
2319                                 bdevname(rdev->bdev,b));
2320                         goto abort_free;
2321                 }
2322         }
2323
2324         INIT_LIST_HEAD(&rdev->same_set);
2325         init_waitqueue_head(&rdev->blocked_wait);
2326
2327         return rdev;
2328
2329 abort_free:
2330         if (rdev->sb_page) {
2331                 if (rdev->bdev)
2332                         unlock_rdev(rdev);
2333                 free_disk_sb(rdev);
2334         }
2335         kfree(rdev);
2336         return ERR_PTR(err);
2337 }
2338
2339 /*
2340  * Check a full RAID array for plausibility
2341  */
2342
2343
2344 static void analyze_sbs(mddev_t * mddev)
2345 {
2346         int i;
2347         struct list_head *tmp;
2348         mdk_rdev_t *rdev, *freshest;
2349         char b[BDEVNAME_SIZE];
2350
2351         freshest = NULL;
2352         rdev_for_each(rdev, tmp, mddev)
2353                 switch (super_types[mddev->major_version].
2354                         load_super(rdev, freshest, mddev->minor_version)) {
2355                 case 1:
2356                         freshest = rdev;
2357                         break;
2358                 case 0:
2359                         break;
2360                 default:
2361                         printk(KERN_ERR
2362                                 "md: fatal superblock inconsistency in %s"
2363                                 " -- removing from array\n", 
2364                                 bdevname(rdev->bdev,b));
2365                         kick_rdev_from_array(rdev);
2366                 }
2367
2368
2369         super_types[mddev->major_version].
2370                 validate_super(mddev, freshest);
2371
2372         i = 0;
2373         rdev_for_each(rdev, tmp, mddev) {
2374                 if (rdev != freshest)
2375                         if (super_types[mddev->major_version].
2376                             validate_super(mddev, rdev)) {
2377                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2378                                         " from array!\n",
2379                                         bdevname(rdev->bdev,b));
2380                                 kick_rdev_from_array(rdev);
2381                                 continue;
2382                         }
2383                 if (mddev->level == LEVEL_MULTIPATH) {
2384                         rdev->desc_nr = i++;
2385                         rdev->raid_disk = rdev->desc_nr;
2386                         set_bit(In_sync, &rdev->flags);
2387                 } else if (rdev->raid_disk >= mddev->raid_disks) {
2388                         rdev->raid_disk = -1;
2389                         clear_bit(In_sync, &rdev->flags);
2390                 }
2391         }
2392
2393
2394
2395         if (mddev->recovery_cp != MaxSector &&
2396             mddev->level >= 1)
2397                 printk(KERN_ERR "md: %s: raid array is not clean"
2398                        " -- starting background reconstruction\n",
2399                        mdname(mddev));
2400
2401 }
2402
2403 static ssize_t
2404 safe_delay_show(mddev_t *mddev, char *page)
2405 {
2406         int msec = (mddev->safemode_delay*1000)/HZ;
2407         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2408 }
2409 static ssize_t
2410 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2411 {
2412         int scale=1;
2413         int dot=0;
2414         int i;
2415         unsigned long msec;
2416         char buf[30];
2417         char *e;
2418         /* remove a period, and count digits after it */
2419         if (len >= sizeof(buf))
2420                 return -EINVAL;
2421         memcpy(buf, cbuf, len); /* strlcpy(.., len) would drop the last byte */
2422         buf[len] = 0;
2423         for (i=0; i<len; i++) {
2424                 if (dot) {
2425                         if (isdigit(buf[i])) {
2426                                 buf[i-1] = buf[i];
2427                                 scale *= 10;
2428                         }
2429                         buf[i] = 0;
2430                 } else if (buf[i] == '.') {
2431                         dot=1;
2432                         buf[i] = 0;
2433                 }
2434         }
2435         msec = simple_strtoul(buf, &e, 10);
2436         if (e == buf || (*e && *e != '\n'))
2437                 return -EINVAL;
2438         msec = (msec * 1000) / scale;
2439         if (msec == 0)
2440                 mddev->safemode_delay = 0;
2441         else {
2442                 mddev->safemode_delay = (msec*HZ)/1000;
2443                 if (mddev->safemode_delay == 0)
2444                         mddev->safemode_delay = 1;
2445         }
2446         return len;
2447 }
2448 static struct md_sysfs_entry md_safe_delay =
2449 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
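
/*
 * Worked example for the decimal-seconds parser above (HZ == 250 is an
 * assumption): writing "0.200" strips the dot, leaving msec == 200 with
 * scale == 1000, so safemode_delay = (200 * 250) / 1000 = 50 jiffies.
 * A plain "2" keeps scale == 1 and yields 2000 msec, i.e. 500 jiffies.
 */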
2450
2451 static ssize_t
2452 level_show(mddev_t *mddev, char *page)
2453 {
2454         struct mdk_personality *p = mddev->pers;
2455         if (p)
2456                 return sprintf(page, "%s\n", p->name);
2457         else if (mddev->clevel[0])
2458                 return sprintf(page, "%s\n", mddev->clevel);
2459         else if (mddev->level != LEVEL_NONE)
2460                 return sprintf(page, "%d\n", mddev->level);
2461         else
2462                 return 0;
2463 }
2464
2465 static ssize_t
2466 level_store(mddev_t *mddev, const char *buf, size_t len)
2467 {
2468         ssize_t rv = len;
2469         if (mddev->pers)
2470                 return -EBUSY;
2471         if (len == 0)
2472                 return 0;
2473         if (len >= sizeof(mddev->clevel))
2474                 return -ENOSPC;
2475         strncpy(mddev->clevel, buf, len);
2476         if (mddev->clevel[len-1] == '\n')
2477                 len--;
2478         mddev->clevel[len] = 0;
2479         mddev->level = LEVEL_NONE;
2480         return rv;
2481 }
2482
2483 static struct md_sysfs_entry md_level =
2484 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2485
2486
2487 static ssize_t
2488 layout_show(mddev_t *mddev, char *page)
2489 {
2490         /* just a number, not meaningful for all levels */
2491         if (mddev->reshape_position != MaxSector &&
2492             mddev->layout != mddev->new_layout)
2493                 return sprintf(page, "%d (%d)\n",
2494                                mddev->new_layout, mddev->layout);
2495         return sprintf(page, "%d\n", mddev->layout);
2496 }
2497
2498 static ssize_t
2499 layout_store(mddev_t *mddev, const char *buf, size_t len)
2500 {
2501         char *e;
2502         unsigned long n = simple_strtoul(buf, &e, 10);
2503
2504         if (!*buf || (*e && *e != '\n'))
2505                 return -EINVAL;
2506
2507         if (mddev->pers)
2508                 return -EBUSY;
2509         if (mddev->reshape_position != MaxSector)
2510                 mddev->new_layout = n;
2511         else
2512                 mddev->layout = n;
2513         return len;
2514 }
2515 static struct md_sysfs_entry md_layout =
2516 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2517
2518
2519 static ssize_t
2520 raid_disks_show(mddev_t *mddev, char *page)
2521 {
2522         if (mddev->raid_disks == 0)
2523                 return 0;
2524         if (mddev->reshape_position != MaxSector &&
2525             mddev->delta_disks != 0)
2526                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2527                                mddev->raid_disks - mddev->delta_disks);
2528         return sprintf(page, "%d\n", mddev->raid_disks);
2529 }
2530
2531 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2532
2533 static ssize_t
2534 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2535 {
2536         char *e;
2537         int rv = 0;
2538         unsigned long n = simple_strtoul(buf, &e, 10);
2539
2540         if (!*buf || (*e && *e != '\n'))
2541                 return -EINVAL;
2542
2543         if (mddev->pers)
2544                 rv = update_raid_disks(mddev, n);
2545         else if (mddev->reshape_position != MaxSector) {
2546                 int olddisks = mddev->raid_disks - mddev->delta_disks;
2547                 mddev->delta_disks = n - olddisks;
2548                 mddev->raid_disks = n;
2549         } else
2550                 mddev->raid_disks = n;
2551         return rv ? rv : len;
2552 }
2553 static struct md_sysfs_entry md_raid_disks =
2554 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
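
/*
 * Example (a sketch; md0 is an assumption): growing a running array
 * from 3 to 4 devices goes through update_raid_disks(), and succeeds
 * only if the personality supports reshaping:
 *
 *   echo 4 > /sys/block/md0/md/raid_disks
 */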
2555
2556 static ssize_t
2557 chunk_size_show(mddev_t *mddev, char *page)
2558 {
2559         if (mddev->reshape_position != MaxSector &&
2560             mddev->chunk_size != mddev->new_chunk)
2561                 return sprintf(page, "%d (%d)\n", mddev->new_chunk,
2562                                mddev->chunk_size);
2563         return sprintf(page, "%d\n", mddev->chunk_size);
2564 }
2565
2566 static ssize_t
2567 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2568 {
2569         /* can only set chunk_size if array is not yet active */
2570         char *e;
2571         unsigned long n = simple_strtoul(buf, &e, 10);
2572
2573         if (!*buf || (*e && *e != '\n'))
2574                 return -EINVAL;
2575
2576         if (mddev->pers)
2577                 return -EBUSY;
2578         else if (mddev->reshape_position != MaxSector)
2579                 mddev->new_chunk = n;
2580         else
2581                 mddev->chunk_size = n;
2582         return len;
2583 }
2584 static struct md_sysfs_entry md_chunk_size =
2585 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2586
2587 static ssize_t
2588 resync_start_show(mddev_t *mddev, char *page)
2589 {
2590         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2591 }
2592
2593 static ssize_t
2594 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2595 {
2596         char *e;
2597         unsigned long long n = simple_strtoull(buf, &e, 10);
2598
2599         if (mddev->pers)
2600                 return -EBUSY;
2601         if (!*buf || (*e && *e != '\n'))
2602                 return -EINVAL;
2603
2604         mddev->recovery_cp = n;
2605         return len;
2606 }
2607 static struct md_sysfs_entry md_resync_start =
2608 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2609
2610 /*
2611  * The array state can be:
2612  *
2613  * clear
2614  *     No devices, no size, no level
2615  *     Equivalent to STOP_ARRAY ioctl
2616  * inactive
2617  *     May have some settings, but array is not active
2618  *        all IO results in error
2619  *     When written, doesn't tear down array, but just stops it
2620  * suspended (not supported yet)
2621  *     All IO requests will block. The array can be reconfigured.
2622  *     Writing this, if accepted, will block until array is quiescent
2623  * readonly
2624  *     no resync can happen.  no superblocks get written.
2625  *     write requests fail
2626  * read-auto
2627  *     like readonly, but behaves like 'clean' on a write request.
2628  *
2629  * clean - no pending writes, but otherwise active.
2630  *     When written to inactive array, starts without resync
2631  *     If a write request arrives then
2632  *       if metadata is known, mark 'dirty' and switch to 'active'.
2633  *       if not known, block and switch to write-pending
2634  *     If written to an active array that has pending writes, then fails.
2635  * active
2636  *     fully active: IO and resync can be happening.
2637  *     When written to inactive array, starts with resync
2638  *
2639  * write-pending
2640  *     clean, but writes are blocked waiting for 'active' to be written.
2641  *
2642  * active-idle
2643  *     like active, but no writes have been seen for a while (100msec).
2644  *
2645  */
2646 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2647                    write_pending, active_idle, bad_word};
2648 static char *array_states[] = {
2649         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2650         "write-pending", "active-idle", NULL };
2651
2652 static int match_word(const char *word, char **list)
2653 {
2654         int n;
2655         for (n=0; list[n]; n++)
2656                 if (cmd_match(word, list[n]))
2657                         break;
2658         return n;
2659 }
2660
2661 static ssize_t
2662 array_state_show(mddev_t *mddev, char *page)
2663 {
2664         enum array_state st = inactive;
2665
2666         if (mddev->pers)
2667                 switch(mddev->ro) {
2668                 case 1:
2669                         st = readonly;
2670                         break;
2671                 case 2:
2672                         st = read_auto;
2673                         break;
2674                 case 0:
2675                         if (mddev->in_sync)
2676                                 st = clean;
2677                         else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
2678                                 st = write_pending;
2679                         else if (mddev->safemode)
2680                                 st = active_idle;
2681                         else
2682                                 st = active;
2683                 }
2684         else {
2685                 if (list_empty(&mddev->disks) &&
2686                     mddev->raid_disks == 0 &&
2687                     mddev->size == 0)
2688                         st = clear;
2689                 else
2690                         st = inactive;
2691         }
2692         return sprintf(page, "%s\n", array_states[st]);
2693 }
2694
2695 static int do_md_stop(mddev_t * mddev, int ro);
2696 static int do_md_run(mddev_t * mddev);
2697 static int restart_array(mddev_t *mddev);
2698
2699 static ssize_t
2700 array_state_store(mddev_t *mddev, const char *buf, size_t len)
2701 {
2702         int err = -EINVAL;
2703         enum array_state st = match_word(buf, array_states);
2704         switch(st) {
2705         case bad_word:
2706                 break;
2707         case clear:
2708                 /* stopping an active array */
2709                 if (atomic_read(&mddev->active) > 1)
2710                         return -EBUSY;
2711                 err = do_md_stop(mddev, 0);
2712                 break;
2713         case inactive:
2714                 /* stopping an active array */
2715                 if (mddev->pers) {
2716                         if (atomic_read(&mddev->active) > 1)
2717                                 return -EBUSY;
2718                         err = do_md_stop(mddev, 2);
2719                 } else
2720                         err = 0; /* already inactive */
2721                 break;
2722         case suspended:
2723                 break; /* not supported yet */
2724         case readonly:
2725                 if (mddev->pers)
2726                         err = do_md_stop(mddev, 1);
2727                 else {
2728                         mddev->ro = 1;
2729                         set_disk_ro(mddev->gendisk, 1);
2730                         err = do_md_run(mddev);
2731                 }
2732                 break;
2733         case read_auto:
2734                 if (mddev->pers) {
2735                         if (mddev->ro != 1)
2736                                 err = do_md_stop(mddev, 1);
2737                         else
2738                                 err = restart_array(mddev);
2739                         if (err == 0) {
2740                                 mddev->ro = 2;
2741                                 set_disk_ro(mddev->gendisk, 0);
2742                         }
2743                 } else {
2744                         mddev->ro = 2;
2745                         err = do_md_run(mddev);
2746                 }
2747                 break;
2748         case clean:
2749                 if (mddev->pers) {
2750                         restart_array(mddev);
2751                         spin_lock_irq(&mddev->write_lock);
2752                         if (atomic_read(&mddev->writes_pending) == 0) {
2753                                 if (mddev->in_sync == 0) {
2754                                         mddev->in_sync = 1;
2755                                         if (mddev->safemode == 1)
2756                                                 mddev->safemode = 0;
2757                                         if (mddev->persistent)
2758                                                 set_bit(MD_CHANGE_CLEAN,
2759                                                         &mddev->flags);
2760                                 }
2761                                 err = 0;
2762                         } else
2763                                 err = -EBUSY;
2764                         spin_unlock_irq(&mddev->write_lock);
2765                 } else {
2766                         mddev->ro = 0;
2767                         mddev->recovery_cp = MaxSector;
2768                         err = do_md_run(mddev);
2769                 }
2770                 break;
2771         case active:
2772                 if (mddev->pers) {
2773                         restart_array(mddev);
2774                         if (mddev->external)
2775                                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2776                         wake_up(&mddev->sb_wait);
2777                         err = 0;
2778                 } else {
2779                         mddev->ro = 0;
2780                         set_disk_ro(mddev->gendisk, 0);
2781                         err = do_md_run(mddev);
2782                 }
2783                 break;
2784         case write_pending:
2785         case active_idle:
2786                 /* these cannot be set */
2787                 break;
2788         }
2789         if (err)
2790                 return err;
2791         else {
2792                 sysfs_notify(&mddev->kobj, NULL, "array_state");
2793                 return len;
2794         }
2795 }
2796 static struct md_sysfs_entry md_array_state =
2797 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
2798
2799 static ssize_t
2800 null_show(mddev_t *mddev, char *page)
2801 {
2802         return -EINVAL;
2803 }
2804
2805 static ssize_t
2806 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2807 {
2808         /* buf must be "%d:%d", with an optional trailing \n, giving major and minor numbers */
2809         /* The new device is added to the array.
2810          * If the array has a persistent superblock, we read the
2811          * superblock to initialise info and check validity.
2812          * Otherwise, only checking done is that in bind_rdev_to_array,
2813          * which mainly checks size.
2814          */
2815         char *e;
2816         int major = simple_strtoul(buf, &e, 10);
2817         int minor;
2818         dev_t dev;
2819         mdk_rdev_t *rdev;
2820         int err;
2821
2822         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2823                 return -EINVAL;
2824         minor = simple_strtoul(e+1, &e, 10);
2825         if (*e && *e != '\n')
2826                 return -EINVAL;
2827         dev = MKDEV(major, minor);
2828         if (major != MAJOR(dev) ||
2829             minor != MINOR(dev))
2830                 return -EOVERFLOW;
2831
2832
2833         if (mddev->persistent) {
2834                 rdev = md_import_device(dev, mddev->major_version,
2835                                         mddev->minor_version);
2836                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2837                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2838                                                        mdk_rdev_t, same_set);
2839                         err = super_types[mddev->major_version]
2840                                 .load_super(rdev, rdev0, mddev->minor_version);
2841                         if (err < 0)
2842                                 goto out;
2843                 }
2844         } else if (mddev->external)
2845                 rdev = md_import_device(dev, -2, -1);
2846         else
2847                 rdev = md_import_device(dev, -1, -1);
2848
2849         if (IS_ERR(rdev))
2850                 return PTR_ERR(rdev);
2851         err = bind_rdev_to_array(rdev, mddev);
2852  out:
2853         if (err)
2854                 export_rdev(rdev);
2855         return err ? err : len;
2856 }
2857
2858 static struct md_sysfs_entry md_new_device =
2859 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
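
/*
 * Example (a sketch; 8:17 is sdb1 under the usual sd numbering, and md0
 * is an assumption): hand a component device to an array being
 * assembled:
 *
 *   echo 8:17 > /sys/block/md0/md/new_dev
 */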
2860
2861 static ssize_t
2862 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
2863 {
2864         char *end;
2865         unsigned long chunk, end_chunk;
2866
2867         if (!mddev->bitmap)
2868                 goto out;
2869         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
2870         while (*buf) {
2871                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
2872                 if (buf == end) break;
2873                 if (*end == '-') { /* range */
2874                         buf = end + 1;
2875                         end_chunk = simple_strtoul(buf, &end, 0);
2876                         if (buf == end) break;
2877                 }
2878                 if (*end && !isspace(*end)) break;
2879                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
2880                 buf = end;
2881                 while (isspace(*buf)) buf++;
2882         }
2883         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
2884 out:
2885         return len;
2886 }
2887
2888 static struct md_sysfs_entry md_bitmap =
2889 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
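
/*
 * Example (illustrative): mark bitmap chunks 0-100 and chunk 200 dirty
 * so the next resync rewrites them:
 *
 *   echo "0-100 200" > /sys/block/md0/md/bitmap_set_bits
 */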
2890
2891 static ssize_t
2892 size_show(mddev_t *mddev, char *page)
2893 {
2894         return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2895 }
2896
2897 static int update_size(mddev_t *mddev, unsigned long size);
2898
2899 static ssize_t
2900 size_store(mddev_t *mddev, const char *buf, size_t len)
2901 {
2902         /* If array is inactive, we can reduce the component size, but
2903          * not increase it (except from 0).
2904          * If array is active, we can try an on-line resize
2905          */
2906         char *e;
2907         int err = 0;
2908         unsigned long long size = simple_strtoull(buf, &e, 10);
2909         if (!*buf || *buf == '\n' ||
2910             (*e && *e != '\n'))
2911                 return -EINVAL;
2912
2913         if (mddev->pers) {
2914                 err = update_size(mddev, size);
2915                 md_update_sb(mddev, 1);
2916         } else {
2917                 if (mddev->size == 0 ||
2918                     mddev->size > size)
2919                         mddev->size = size;
2920                 else
2921                         err = -ENOSPC;
2922         }
2923         return err ? err : len;
2924 }
2925
2926 static struct md_sysfs_entry md_size =
2927 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
2928
2929
2930 /* Metadata version.
2931  * This is one of
2932  *   'none' for arrays with no metadata (good luck...)
2933  *   'external' for arrays with externally managed metadata,
2934  * or N.M for internally known formats
2935  */
2936 static ssize_t
2937 metadata_show(mddev_t *mddev, char *page)
2938 {
2939         if (mddev->persistent)
2940                 return sprintf(page, "%d.%d\n",
2941                                mddev->major_version, mddev->minor_version);
2942         else if (mddev->external)
2943                 return sprintf(page, "external:%s\n", mddev->metadata_type);
2944         else
2945                 return sprintf(page, "none\n");
2946 }
2947
2948 static ssize_t
2949 metadata_store(mddev_t *mddev, const char *buf, size_t len)
2950 {
2951         int major, minor;
2952         char *e;
2953         if (!list_empty(&mddev->disks))
2954                 return -EBUSY;
2955
2956         if (cmd_match(buf, "none")) {
2957                 mddev->persistent = 0;
2958                 mddev->external = 0;
2959                 mddev->major_version = 0;
2960                 mddev->minor_version = 90;
2961                 return len;
2962         }
2963         if (strncmp(buf, "external:", 9) == 0) {
2964                 size_t namelen = len-9;
2965                 if (namelen >= sizeof(mddev->metadata_type))
2966                         namelen = sizeof(mddev->metadata_type)-1;
2967                 strncpy(mddev->metadata_type, buf+9, namelen);
2968                 mddev->metadata_type[namelen] = 0;
2969                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
2970                         mddev->metadata_type[--namelen] = 0;
2971                 mddev->persistent = 0;
2972                 mddev->external = 1;
2973                 mddev->major_version = 0;
2974                 mddev->minor_version = 90;
2975                 return len;
2976         }
2977         major = simple_strtoul(buf, &e, 10);
2978         if (e==buf || *e != '.')
2979                 return -EINVAL;
2980         buf = e+1;
2981         minor = simple_strtoul(buf, &e, 10);
2982         if (e==buf || (*e && *e != '\n') )
2983                 return -EINVAL;
2984         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
2985                 return -ENOENT;
2986         mddev->major_version = major;
2987         mddev->minor_version = minor;
2988         mddev->persistent = 1;
2989         mddev->external = 0;
2990         return len;
2991 }
2992
2993 static struct md_sysfs_entry md_metadata =
2994 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
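/*
 * Illustrative usage (a sketch), matching the parser above; writes are
 * only accepted while the array has no disks:
 *
 *   echo 0.90 > /sys/block/md0/md/metadata_version
 *   echo external:imsm > /sys/block/md0/md/metadata_version
 *   echo none > /sys/block/md0/md/metadata_version
 *
 * "imsm" is only an example handler name; any short string is stored
 * in metadata_type.
 */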
2995
2996 static ssize_t
2997 action_show(mddev_t *mddev, char *page)
2998 {
2999         char *type = "idle";
3000         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3001             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3002                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3003                         type = "reshape";
3004                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3005                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3006                                 type = "resync";
3007                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3008                                 type = "check";
3009                         else
3010                                 type = "repair";
3011                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3012                         type = "recover";
3013         }
3014         return sprintf(page, "%s\n", type);
3015 }
3016
3017 static ssize_t
3018 action_store(mddev_t *mddev, const char *page, size_t len)
3019 {
3020         if (!mddev->pers || !mddev->pers->sync_request)
3021                 return -EINVAL;
3022
3023         if (cmd_match(page, "idle")) {
3024                 if (mddev->sync_thread) {
3025                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3026                         md_unregister_thread(mddev->sync_thread);
3027                         mddev->sync_thread = NULL;
3028                         mddev->recovery = 0;
3029                 }
3030         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3031                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3032                 return -EBUSY;
3033         else if (cmd_match(page, "resync"))
3034                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3035         else if (cmd_match(page, "recover")) {
3036                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3037                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3038         } else if (cmd_match(page, "reshape")) {
3039                 int err;
3040                 if (mddev->pers->start_reshape == NULL)
3041                         return -EINVAL;
3042                 err = mddev->pers->start_reshape(mddev);
3043                 if (err)
3044                         return err;
3045                 sysfs_notify(&mddev->kobj, NULL, "degraded");
3046         } else {
3047                 if (cmd_match(page, "check"))
3048                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3049                 else if (!cmd_match(page, "repair"))
3050                         return -EINVAL;
3051                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3052                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3053         }
3054         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3055         md_wakeup_thread(mddev->thread);
3056         sysfs_notify(&mddev->kobj, NULL, "sync_action");
3057         return len;
3058 }
3059
3060 static ssize_t
3061 mismatch_cnt_show(mddev_t *mddev, char *page)
3062 {
3063         return sprintf(page, "%llu\n",
3064                        (unsigned long long) mddev->resync_mismatches);
3065 }
3066
3067 static struct md_sysfs_entry md_scan_mode =
3068 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
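/*
 * Illustrative usage (a sketch), per action_store() above; the accepted
 * words are idle, resync, recover, reshape, check and repair:
 *
 *   echo check > /sys/block/md0/md/sync_action    # read-only scrub
 *   echo repair > /sys/block/md0/md/sync_action   # scrub and rewrite
 *   echo idle > /sys/block/md0/md/sync_action     # interrupt it
 */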
3069
3070
3071 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3072
3073 static ssize_t
3074 sync_min_show(mddev_t *mddev, char *page)
3075 {
3076         return sprintf(page, "%d (%s)\n", speed_min(mddev),
3077                        mddev->sync_speed_min ? "local": "system");
3078 }
3079
3080 static ssize_t
3081 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3082 {
3083         int min;
3084         char *e;
3085         if (strncmp(buf, "system", 6)==0) {
3086                 mddev->sync_speed_min = 0;
3087                 return len;
3088         }
3089         min = simple_strtoul(buf, &e, 10);
3090         if (buf == e || (*e && *e != '\n') || min <= 0)
3091                 return -EINVAL;
3092         mddev->sync_speed_min = min;
3093         return len;
3094 }
3095
3096 static struct md_sysfs_entry md_sync_min =
3097 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3098
3099 static ssize_t
3100 sync_max_show(mddev_t *mddev, char *page)
3101 {
3102         return sprintf(page, "%d (%s)\n", speed_max(mddev),
3103                        mddev->sync_speed_max ? "local": "system");
3104 }
3105
3106 static ssize_t
3107 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3108 {
3109         int max;
3110         char *e;
3111         if (strncmp(buf, "system", 6)==0) {
3112                 mddev->sync_speed_max = 0;
3113                 return len;
3114         }
3115         max = simple_strtoul(buf, &e, 10);
3116         if (buf == e || (*e && *e != '\n') || max <= 0)
3117                 return -EINVAL;
3118         mddev->sync_speed_max = max;
3119         return len;
3120 }
3121
3122 static struct md_sysfs_entry md_sync_max =
3123 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
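/*
 * Illustrative usage (a sketch): sync_speed_min/sync_speed_max take a
 * per-array rate in KiB/sec, or the word "system" to fall back to the
 * system-wide defaults, e.g.:
 *
 *   echo 50000 > /sys/block/md0/md/sync_speed_max
 *   echo system > /sys/block/md0/md/sync_speed_max
 */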
3124
3125 static ssize_t
3126 degraded_show(mddev_t *mddev, char *page)
3127 {
3128         return sprintf(page, "%d\n", mddev->degraded);
3129 }
3130 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3131
3132 static ssize_t
3133 sync_force_parallel_show(mddev_t *mddev, char *page)
3134 {
3135         return sprintf(page, "%d\n", mddev->parallel_resync);
3136 }
3137
3138 static ssize_t
3139 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3140 {
3141         long n;
3142
3143         if (strict_strtol(buf, 10, &n))
3144                 return -EINVAL;
3145
3146         if (n != 0 && n != 1)
3147                 return -EINVAL;
3148
3149         mddev->parallel_resync = n;
3150
3151         if (mddev->sync_thread)
3152                 wake_up(&resync_wait);
3153
3154         return len;
3155 }
3156
3157 /* force parallel resync, even with shared block devices */
3158 static struct md_sysfs_entry md_sync_force_parallel =
3159 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3160        sync_force_parallel_show, sync_force_parallel_store);
3161
3162 static ssize_t
3163 sync_speed_show(mddev_t *mddev, char *page)
3164 {
3165         unsigned long resync, dt, db;
3166         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3167         dt = (jiffies - mddev->resync_mark) / HZ;
3168         if (!dt) dt++;
3169         db = resync - mddev->resync_mark_cnt;
3170         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3171 }
3172
3173 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3174
3175 static ssize_t
3176 sync_completed_show(mddev_t *mddev, char *page)
3177 {
3178         unsigned long max_blocks, resync;
3179
3180         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3181                 max_blocks = mddev->resync_max_sectors;
3182         else
3183                 max_blocks = mddev->size << 1;
3184
3185         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3186         return sprintf(page, "%lu / %lu\n", resync, max_blocks);
3187 }
3188
3189 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
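/*
 * Illustrative output (a sketch): sync_completed reads back as
 * "<done> / <total>" in 512-byte sectors, e.g.:
 *
 *   cat /sys/block/md0/md/sync_completed
 *   104320 / 976772864
 *
 * The figures are invented for the example.
 */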
3190
3191 static ssize_t
3192 min_sync_show(mddev_t *mddev, char *page)
3193 {
3194         return sprintf(page, "%llu\n",
3195                        (unsigned long long)mddev->resync_min);
3196 }
3197 static ssize_t
3198 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3199 {
3200         unsigned long long min;
3201         if (strict_strtoull(buf, 10, &min))
3202                 return -EINVAL;
3203         if (min > mddev->resync_max)
3204                 return -EINVAL;
3205         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3206                 return -EBUSY;
3207
3208         /* Must be a multiple of chunk_size */
3209         if (mddev->chunk_size) {
3210                 if (min & (sector_t)((mddev->chunk_size>>9)-1))
3211                         return -EINVAL;
3212         }
3213         mddev->resync_min = min;
3214
3215         return len;
3216 }
3217
3218 static struct md_sysfs_entry md_min_sync =
3219 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3220
3221 static ssize_t
3222 max_sync_show(mddev_t *mddev, char *page)
3223 {
3224         if (mddev->resync_max == MaxSector)
3225                 return sprintf(page, "max\n");
3226         else
3227                 return sprintf(page, "%llu\n",
3228                                (unsigned long long)mddev->resync_max);
3229 }
3230 static ssize_t
3231 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3232 {
3233         if (strncmp(buf, "max", 3) == 0)
3234                 mddev->resync_max = MaxSector;
3235         else {
3236                 unsigned long long max;
3237                 if (strict_strtoull(buf, 10, &max))
3238                         return -EINVAL;
3239                 if (max < mddev->resync_min)
3240                         return -EINVAL;
3241                 if (max < mddev->resync_max &&
3242                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3243                         return -EBUSY;
3244
3245                 /* Must be a multiple of chunk_size */
3246                 if (mddev->chunk_size) {
3247                         if (max & (sector_t)((mddev->chunk_size>>9)-1))
3248                                 return -EINVAL;
3249                 }
3250                 mddev->resync_max = max;
3251         }
3252         wake_up(&mddev->recovery_wait);
3253         return len;
3254 }
3255
3256 static struct md_sysfs_entry md_max_sync =
3257 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
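/*
 * Illustrative usage (a sketch): sync_min/sync_max bound the sector
 * range a sync operation (e.g. a requested check/repair) covers; on
 * chunked arrays both must be chunk-aligned, as checked above.  To
 * scrub roughly the first GiB only:
 *
 *   echo 0 > /sys/block/md0/md/sync_min
 *   echo 2097152 > /sys/block/md0/md/sync_max
 *   echo check > /sys/block/md0/md/sync_action
 */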
3258
3259 static ssize_t
3260 suspend_lo_show(mddev_t *mddev, char *page)
3261 {
3262         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3263 }
3264
3265 static ssize_t
3266 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3267 {
3268         char *e;
3269         unsigned long long new = simple_strtoull(buf, &e, 10);
3270
3271         if (mddev->pers->quiesce == NULL)
3272                 return -EINVAL;
3273         if (buf == e || (*e && *e != '\n'))
3274                 return -EINVAL;
3275         if (new >= mddev->suspend_hi ||
3276             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3277                 mddev->suspend_lo = new;
3278                 mddev->pers->quiesce(mddev, 2);
3279                 return len;
3280         } else
3281                 return -EINVAL;
3282 }
3283 static struct md_sysfs_entry md_suspend_lo =
3284 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3285
3286
3287 static ssize_t
3288 suspend_hi_show(mddev_t *mddev, char *page)
3289 {
3290         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3291 }
3292
3293 static ssize_t
3294 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3295 {
3296         char *e;
3297         unsigned long long new = simple_strtoull(buf, &e, 10);
3298
3299         if (mddev->pers->quiesce == NULL)
3300                 return -EINVAL;
3301         if (buf == e || (*e && *e != '\n'))
3302                 return -EINVAL;
3303         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3304             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3305                 mddev->suspend_hi = new;
3306                 mddev->pers->quiesce(mddev, 1);
3307                 mddev->pers->quiesce(mddev, 0);
3308                 return len;
3309         } else
3310                 return -EINVAL;
3311 }
3312 static struct md_sysfs_entry md_suspend_hi =
3313 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
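/*
 * Illustrative usage (a sketch): suspend_lo/suspend_hi delimit a sector
 * range in which incoming requests are held off via the personality's
 * quiesce hook, e.g.:
 *
 *   echo 0 > /sys/block/md0/md/suspend_lo
 *   echo 8192 > /sys/block/md0/md/suspend_hi
 *
 * The values are sector offsets; the md0 name and the numbers are
 * invented for the example.
 */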
3314
3315 static ssize_t
3316 reshape_position_show(mddev_t *mddev, char *page)
3317 {
3318         if (mddev->reshape_position != MaxSector)
3319                 return sprintf(page, "%llu\n",
3320                                (unsigned long long)mddev->reshape_position);
3321         strcpy(page, "none\n");
3322         return 5;
3323 }
3324
3325 static ssize_t
3326 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3327 {
3328         char *e;
3329         unsigned long long new = simple_strtoull(buf, &e, 10);
3330         if (mddev->pers)
3331                 return -EBUSY;
3332         if (buf == e || (*e && *e != '\n'))
3333                 return -EINVAL;
3334         mddev->reshape_position = new;
3335         mddev->delta_disks = 0;
3336         mddev->new_level = mddev->level;
3337         mddev->new_layout = mddev->layout;
3338         mddev->new_chunk = mddev->chunk_size;
3339         return len;
3340 }
3341
3342 static struct md_sysfs_entry md_reshape_position =
3343 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3344        reshape_position_store);
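/*
 * Illustrative usage (a sketch): on an inactive array the checkpoint of
 * an interrupted reshape can be seeded before restarting it, e.g.:
 *
 *   echo 1953536 > /sys/block/md0/md/reshape_position
 *
 * The value is a sector offset; reads report "none" when no reshape is
 * pending.  The restart workflow itself is an assumption here.
 */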
3345
3346
3347 static struct attribute *md_default_attrs[] = {
3348         &md_level.attr,
3349         &md_layout.attr,
3350         &md_raid_disks.attr,
3351         &md_chunk_size.attr,
3352         &md_size.attr,
3353         &md_resync_start.attr,
3354         &md_metadata.attr,
3355         &md_new_device.attr,
3356         &md_safe_delay.attr,
3357         &md_array_state.attr,
3358         &md_reshape_position.attr,
3359         NULL,
3360 };
3361
3362 static struct attribute *md_redundancy_attrs[] = {
3363         &md_scan_mode.attr,
3364         &md_mismatches.attr,
3365         &md_sync_min.attr,
3366         &md_sync_max.attr,
3367         &md_sync_speed.attr,
3368         &md_sync_force_parallel.attr,
3369         &md_sync_completed.attr,
3370         &md_min_sync.attr,
3371         &md_max_sync.attr,
3372         &md_suspend_lo.attr,
3373         &md_suspend_hi.attr,
3374         &md_bitmap.attr,
3375         &md_degraded.attr,
3376         NULL,
3377 };
3378 static struct attribute_group md_redundancy_group = {
3379         .name = NULL,
3380         .attrs = md_redundancy_attrs,
3381 };
3382
3383
3384 static ssize_t
3385 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3386 {
3387         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3388         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3389         ssize_t rv;
3390
3391         if (!entry->show)
3392                 return -EIO;
3393         rv = mddev_lock(mddev);
3394         if (!rv) {
3395                 rv = entry->show(mddev, page);
3396                 mddev_unlock(mddev);
3397         }
3398         return rv;
3399 }
3400
3401 static ssize_t
3402 md_attr_store(struct kobject *kobj, struct attribute *attr,
3403               const char *page, size_t length)
3404 {
3405         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3406         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3407         ssize_t rv;
3408
3409         if (!entry->store)
3410                 return -EIO;
3411         if (!capable(CAP_SYS_ADMIN))
3412                 return -EACCES;
3413         rv = mddev_lock(mddev);
3414         if (!rv) {
3415                 rv = entry->store(mddev, page, length);
3416                 mddev_unlock(mddev);
3417         }
3418         return rv;
3419 }
3420
3421 static void md_free(struct kobject *ko)
3422 {
3423         mddev_t *mddev = container_of(ko, mddev_t, kobj);
3424         kfree(mddev);
3425 }
3426
3427 static struct sysfs_ops md_sysfs_ops = {
3428         .show   = md_attr_show,
3429         .store  = md_attr_store,
3430 };
3431 static struct kobj_type md_ktype = {
3432         .release        = md_free,
3433         .sysfs_ops      = &md_sysfs_ops,
3434         .default_attrs  = md_default_attrs,
3435 };
3436
3437 int mdp_major = 0;
3438
3439 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3440 {
3441         static DEFINE_MUTEX(disks_mutex);
3442         mddev_t *mddev = mddev_find(dev);
3443         struct gendisk *disk;
3444         int partitioned = (MAJOR(dev) != MD_MAJOR);
3445         int shift = partitioned ? MdpMinorShift : 0;
3446         int unit = MINOR(dev) >> shift;
3447         int error;
3448
3449         if (!mddev)
3450                 return NULL;
3451
3452         mutex_lock(&disks_mutex);
3453         if (mddev->gendisk) {
3454                 mutex_unlock(&disks_mutex);
3455                 mddev_put(mddev);
3456                 return NULL;
3457         }
3458         disk = alloc_disk(1 << shift);
3459         if (!disk) {
3460                 mutex_unlock(&disks_mutex);
3461                 mddev_put(mddev);
3462                 return NULL;
3463         }
3464         disk->major = MAJOR(dev);
3465         disk->first_minor = unit << shift;
3466         if (partitioned)
3467                 sprintf(disk->disk_name, "md_d%d", unit);
3468         else
3469                 sprintf(disk->disk_name, "md%d", unit);
3470         disk->fops = &md_fops;
3471         disk->private_data = mddev;
3472         disk->queue = mddev->queue;
3473         add_disk(disk);
3474         mddev->gendisk = disk;
3475         error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3476                                      "%s", "md");
3477         mutex_unlock(&disks_mutex);
3478         if (error)
3479                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3480                        disk->disk_name);
3481         else
3482                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3483         return NULL;
3484 }
3485
3486 static void md_safemode_timeout(unsigned long data)
3487 {
3488         mddev_t *mddev = (mddev_t *) data;
3489
3490         if (!atomic_read(&mddev->writes_pending)) {
3491                 mddev->safemode = 1;
3492                 if (mddev->external)
3493                         sysfs_notify(&mddev->kobj, NULL, "array_state");
3494         }
3495         md_wakeup_thread(mddev->thread);
3496 }
3497
3498 static int start_dirty_degraded;
3499
3500 static int do_md_run(mddev_t * mddev)
3501 {
3502         int err;
3503         int chunk_size;
3504         struct list_head *tmp;
3505         mdk_rdev_t *rdev;
3506         struct gendisk *disk;
3507         struct mdk_personality *pers;
3508         char b[BDEVNAME_SIZE];
3509
3510         if (list_empty(&mddev->disks))
3511                 /* cannot run an array with no devices */
3512                 return -EINVAL;
3513
3514         if (mddev->pers)
3515                 return -EBUSY;
3516
3517         /*
3518          * Analyze all RAID superblock(s)
3519          */
3520         if (!mddev->raid_disks) {
3521                 if (!mddev->persistent)
3522                         return -EINVAL;
3523                 analyze_sbs(mddev);
3524         }
3525
3526         chunk_size = mddev->chunk_size;
3527
3528         if (chunk_size) {
3529                 if (chunk_size > MAX_CHUNK_SIZE) {
3530                         printk(KERN_ERR "too big chunk_size: %d > %d\n",
3531                                 chunk_size, MAX_CHUNK_SIZE);
3532                         return -EINVAL;
3533                 }
3534                 /*
3535                  * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
3536                  */
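                /*
                 * Worked example of the check below: ffz(~x) gives the
                 * index of the lowest set bit of x, so (1 << ffz(~x))
                 * isolates that bit and equals x only when x is a power
                 * of two.  65536 (64KiB) passes; 65536+4096 does not.
                 */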
3537                 if ( (1 << ffz(~chunk_size)) != chunk_size) {
3538                         printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
3539                         return -EINVAL;
3540                 }
3541                 if (chunk_size < PAGE_SIZE) {
3542                         printk(KERN_ERR "too small chunk_size: %d < %ld\n",
3543                                 chunk_size, PAGE_SIZE);
3544                         return -EINVAL;
3545                 }
3546
3547                 /* devices must have a minimum size of one chunk */
3548                 rdev_for_each(rdev, tmp, mddev) {
3549                         if (test_bit(Faulty, &rdev->flags))
3550                                 continue;
3551                         if (rdev->size < chunk_size / 1024) {
3552                                 printk(KERN_WARNING
3553                                         "md: Dev %s smaller than chunk_size:"
3554                                         " %lluk < %dk\n",
3555                                         bdevname(rdev->bdev,b),
3556                                         (unsigned long long)rdev->size,
3557                                         chunk_size / 1024);
3558                                 return -EINVAL;
3559                         }
3560                 }
3561         }
3562
3563 #ifdef CONFIG_KMOD
3564         if (mddev->level != LEVEL_NONE)
3565                 request_module("md-level-%d", mddev->level);
3566         else if (mddev->clevel[0])
3567                 request_module("md-%s", mddev->clevel);
3568 #endif
3569
3570         /*
3571          * Drop all container device buffers, from now on
3572          * the only valid external interface is through the md
3573          * device.
3574          */
3575         rdev_for_each(rdev, tmp, mddev) {
3576                 if (test_bit(Faulty, &rdev->flags))
3577                         continue;
3578                 sync_blockdev(rdev->bdev);
3579                 invalidate_bdev(rdev->bdev);
3580
3581                 /* Perform some consistency tests on the device.
3582                  * We don't want the data to overlap the metadata;
3583                  * internal bitmap issues are handled elsewhere.
3584                  */
3585                 if (rdev->data_offset < rdev->sb_offset) {
3586                         if (mddev->size &&
3587                             rdev->data_offset + mddev->size*2
3588                             > rdev->sb_offset*2) {
3589                                 printk("md: %s: data overlaps metadata\n",
3590                                        mdname(mddev));
3591                                 return -EINVAL;
3592                         }
3593                 } else {
3594                         if (rdev->sb_offset*2 + rdev->sb_size/512
3595                             > rdev->data_offset) {
3596                                 printk("md: %s: metadata overlaps data\n",
3597                                        mdname(mddev));
3598                                 return -EINVAL;
3599                         }
3600                 }
3601                 sysfs_notify(&rdev->kobj, NULL, "state");
3602         }
3603
3604         md_probe(mddev->unit, NULL, NULL);
3605         disk = mddev->gendisk;
3606         if (!disk)
3607                 return -ENOMEM;
3608
3609         spin_lock(&pers_lock);
3610         pers = find_pers(mddev->level, mddev->clevel);
3611         if (!pers || !try_module_get(pers->owner)) {
3612                 spin_unlock(&pers_lock);
3613                 if (mddev->level != LEVEL_NONE)
3614                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
3615                                mddev->level);
3616                 else
3617                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
3618                                mddev->clevel);
3619                 return -EINVAL;
3620         }
3621         mddev->pers = pers;
3622         spin_unlock(&pers_lock);
3623         mddev->level = pers->level;
3624         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3625
3626         if (mddev->reshape_position != MaxSector &&
3627             pers->start_reshape == NULL) {
3628                 /* This personality cannot handle reshaping... */
3629                 mddev->pers = NULL;
3630                 module_put(pers->owner);
3631                 return -EINVAL;
3632         }
3633
3634         if (pers->sync_request) {
3635                 /* Warn if this is a potentially silly
3636                  * configuration.
3637                  */
3638                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3639                 mdk_rdev_t *rdev2;
3640                 struct list_head *tmp2;
3641                 int warned = 0;
3642                 rdev_for_each(rdev, tmp, mddev) {
3643                         rdev_for_each(rdev2, tmp2, mddev) {
3644                                 if (rdev < rdev2 &&
3645                                     rdev->bdev->bd_contains ==
3646                                     rdev2->bdev->bd_contains) {
3647                                         printk(KERN_WARNING
3648                                                "%s: WARNING: %s appears to be"
3649                                                " on the same physical disk as"
3650                                                " %s.\n",
3651                                                mdname(mddev),
3652                                                bdevname(rdev->bdev,b),
3653                                                bdevname(rdev2->bdev,b2));
3654                                         warned = 1;
3655                                 }
3656                         }
3657                 }
3658                 if (warned)
3659                         printk(KERN_WARNING
3660                                "True protection against single-disk"
3661                                " failure might be compromised.\n");
3662         }
3663
3664         mddev->recovery = 0;
3665         mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
3666         mddev->barriers_work = 1;
3667         mddev->ok_start_degraded = start_dirty_degraded;
3668
3669         if (start_readonly)
3670                 mddev->ro = 2; /* read-only, but switch on first write */
3671
3672         err = mddev->pers->run(mddev);
3673         if (err)
3674                 printk(KERN_ERR "md: pers->run() failed ...\n");
3675         else if (mddev->pers->sync_request) {
3676                 err = bitmap_create(mddev);
3677                 if (err) {
3678                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
3679                                mdname(mddev), err);
3680                         mddev->pers->stop(mddev);
3681                 }
3682         }
3683         if (err) {
3684                 module_put(mddev->pers->owner);
3685                 mddev->pers = NULL;
3686                 bitmap_destroy(mddev);
3687                 return err;
3688         }
3689         if (mddev->pers->sync_request) {
3690                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3691                         printk(KERN_WARNING
3692                                "md: cannot register extra attributes for %s\n",
3693                                mdname(mddev));
3694         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3695                 mddev->ro = 0;
3696
3697         atomic_set(&mddev->writes_pending,0);
3698         mddev->safemode = 0;
3699         mddev->safemode_timer.function = md_safemode_timeout;
3700         mddev->safemode_timer.data = (unsigned long) mddev;
3701         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3702         mddev->in_sync = 1;
3703
3704         rdev_for_each(rdev, tmp, mddev)
3705                 if (rdev->raid_disk >= 0) {
3706                         char nm[20];
3707                         sprintf(nm, "rd%d", rdev->raid_disk);
3708                         if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
3709                                 printk("md: cannot register %s for %s\n",
3710                                        nm, mdname(mddev));
3711                 }
3712         
3713         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3714         
3715         if (mddev->flags)
3716                 md_update_sb(mddev, 0);
3717
3718         set_capacity(disk, mddev->array_size<<1);
3719
3720         /* If we call blk_queue_make_request here, it will
3721          * re-initialise max_sectors etc which may have been
3722          * refined inside ->run().  So just set the bits we need to set.
3723          * Most initialisation happened when we called
3724          * blk_queue_make_request(..., md_fail_request)
3725          * earlier.
3726          */
3727         mddev->queue->queuedata = mddev;
3728         mddev->queue->make_request_fn = mddev->pers->make_request;
3729
3730         /* If there is a partially-recovered drive we need to
3731          * start recovery here.  If we leave it to md_check_recovery,
3732          * it will remove the drives and not do the right thing.
3733          */
3734         if (mddev->degraded && !mddev->sync_thread) {
3735                 struct list_head *rtmp;
3736                 int spares = 0;
3737                 rdev_for_each(rdev, rtmp, mddev)
3738                         if (rdev->raid_disk >= 0 &&
3739                             !test_bit(In_sync, &rdev->flags) &&
3740                             !test_bit(Faulty, &rdev->flags))
3741                                 /* complete an interrupted recovery */
3742                                 spares++;
3743                 if (spares && mddev->pers->sync_request) {
3744                         mddev->recovery = 0;
3745                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3746                         mddev->sync_thread = md_register_thread(md_do_sync,
3747                                                                 mddev,
3748                                                                 "%s_resync");
3749                         if (!mddev->sync_thread) {
3750                                 printk(KERN_ERR "%s: could not start resync"
3751                                        " thread...\n",
3752                                        mdname(mddev));
3753                                 /* leave the spares where they are, it shouldn't hurt */
3754                                 mddev->recovery = 0;
3755                         }
3756                 }
3757         }
3758         md_wakeup_thread(mddev->thread);
3759         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
3760
3761         mddev->changed = 1;
3762         md_new_event(mddev);
3763         sysfs_notify(&mddev->kobj, NULL, "array_state");
3764         sysfs_notify(&mddev->kobj, NULL, "sync_action");
3765         sysfs_notify(&mddev->kobj, NULL, "degraded");
3766         kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3767         return 0;
3768 }
3769
3770 static int restart_array(mddev_t *mddev)
3771 {
3772         struct gendisk *disk = mddev->gendisk;
3773         int err;
3774
3775         /*
3776          * Complain if it has no devices
3777          */
3778         err = -ENXIO;
3779         if (list_empty(&mddev->disks))
3780                 goto out;
3781
3782         if (mddev->pers) {
3783                 err = -EBUSY;
3784                 if (!mddev->ro)
3785                         goto out;
3786
3787                 mddev->safemode = 0;
3788                 mddev->ro = 0;
3789                 set_disk_ro(disk, 0);
3790
3791                 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3792                         mdname(mddev));
3793                 /*
3794                  * Kick recovery or resync if necessary
3795                  */
3796                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3797                 md_wakeup_thread(mddev->thread);
3798                 md_wakeup_thread(mddev->sync_thread);
3799                 err = 0;
3800                 sysfs_notify(&mddev->kobj, NULL, "array_state");
3801
3802         } else
3803                 err = -EINVAL;
3804
3805 out:
3806         return err;
3807 }
3808
3809 /* similar to deny_write_access, but accounts for our holding a reference
3810  * to the file ourselves */
3811 static int deny_bitmap_write_access(struct file * file)
3812 {
3813         struct inode *inode = file->f_mapping->host;
3814
3815         spin_lock(&inode->i_lock);
3816         if (atomic_read(&inode->i_writecount) > 1) {
3817                 spin_unlock(&inode->i_lock);
3818                 return -ETXTBSY;
3819         }
3820         atomic_set(&inode->i_writecount, -1);
3821         spin_unlock(&inode->i_lock);
3822
3823         return 0;
3824 }
3825
3826 static void restore_bitmap_write_access(struct file *file)
3827 {
3828         struct inode *inode = file->f_mapping->host;
3829
3830         spin_lock(&inode->i_lock);
3831         atomic_set(&inode->i_writecount, 1);
3832         spin_unlock(&inode->i_lock);
3833 }
3834
3835 /* mode:
3836  *   0 - completely stop and disassemble array
3837  *   1 - switch to readonly
3838  *   2 - stop but do not disassemble array
3839  */
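/*
 * Illustrative mapping (an assumption from the ioctl names, not stated
 * in this file): mode 0 corresponds to the STOP_ARRAY ioctl, mode 1 to
 * STOP_ARRAY_RO, and mode 2 to a stop that leaves the array assembled.
 */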
3840 static int do_md_stop(mddev_t * mddev, int mode)
3841 {
3842         int err = 0;
3843         struct gendisk *disk = mddev->gendisk;
3844
3845         if (mddev->pers) {
3846                 if (atomic_read(&mddev->active)>2) {
3847                         printk("md: %s still in use.\n",mdname(mddev));
3848                         return -EBUSY;
3849                 }
3850
3851                 if (mddev->sync_thread) {
3852                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3853                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3854                         md_unregister_thread(mddev->sync_thread);
3855                         mddev->sync_thread = NULL;
3856                 }
3857
3858                 del_timer_sync(&mddev->safemode_timer);
3859
3860                 invalidate_partition(disk, 0);
3861
3862                 switch(mode) {
3863                 case 1: /* readonly */
3864                         err  = -ENXIO;
3865                         if (mddev->ro==1)
3866                                 goto out;
3867                         mddev->ro = 1;
3868                         break;
3869                 case 0: /* disassemble */
3870                 case 2: /* stop */
3871                         bitmap_flush(mddev);
3872                         md_super_wait(mddev);
3873                         if (mddev->ro)
3874                                 set_disk_ro(disk, 0);
3875                         blk_queue_make_request(mddev->queue, md_fail_request);
3876                         mddev->pers->stop(mddev);
3877                         mddev->queue->merge_bvec_fn = NULL;
3878                         mddev->queue->unplug_fn = NULL;
3879                         mddev->queue->backing_dev_info.congested_fn = NULL;
3880                         if (mddev->pers->sync_request)
3881                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3882
3883                         module_put(mddev->pers->owner);
3884                         mddev->pers = NULL;
3885                         /* tell userspace to handle 'inactive' */
3886                         sysfs_notify(&mddev->kobj, NULL, "array_state");
3887
3888                         set_capacity(disk, 0);
3889                         mddev->changed = 1;
3890
3891                         if (mddev->ro)
3892                                 mddev->ro = 0;
3893                 }
3894                 if (!mddev->in_sync || mddev->flags) {
3895                         /* mark array as shutdown cleanly */
3896                         mddev->in_sync = 1;
3897                         md_update_sb(mddev, 1);
3898                 }
3899                 if (mode == 1)
3900                         set_disk_ro(disk, 1);
3901                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3902         }
3903
3904         /*
3905          * Free resources if final stop
3906          */
3907         if (mode == 0) {
3908                 mdk_rdev_t *rdev;
3909                 struct list_head *tmp;
3910
3911                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3912
3913                 bitmap_destroy(mddev);
3914                 if (mddev->bitmap_file) {
3915                         restore_bitmap_write_access(mddev->bitmap_file);
3916                         fput(mddev->bitmap_file);
3917                         mddev->bitmap_file = NULL;
3918                 }
3919                 mddev->bitmap_offset = 0;
3920
3921                 rdev_for_each(rdev, tmp, mddev)
3922                         if (rdev->raid_disk >= 0) {
3923                                 char nm[20];
3924                                 sprintf(nm, "rd%d", rdev->raid_disk);
3925                                 sysfs_remove_link(&mddev->kobj, nm);
3926                         }
3927
3928                 /* make sure all md_delayed_delete calls have finished */
3929                 flush_scheduled_work();
3930
3931                 export_array(mddev);
3932
3933                 mddev->array_size = 0;
3934                 mddev->size = 0;
3935                 mddev->raid_disks = 0;
3936                 mddev->recovery_cp = 0;
3937                 mddev->resync_min = 0;
3938                 mddev->resync_max = MaxSector;
3939                 mddev->reshape_position = MaxSector;
3940                 mddev->external = 0;
3941                 mddev->persistent = 0;
3942                 mddev->level = LEVEL_NONE;
3943                 mddev->clevel[0] = 0;
3944                 mddev->flags = 0;
3945                 mddev->ro = 0;
3946                 mddev->metadata_type[0] = 0;
3947                 mddev->chunk_size = 0;
3948                 mddev->ctime = mddev->utime = 0;
3949                 mddev->layout = 0;
3950                 mddev->max_disks = 0;
3951                 mddev->events = 0;
3952                 mddev->delta_disks = 0;
3953                 mddev->new_level = LEVEL_NONE;
3954                 mddev->new_layout = 0;
3955                 mddev->new_chunk = 0;
3956                 mddev->curr_resync = 0;
3957                 mddev->resync_mismatches = 0;
3958                 mddev->suspend_lo = mddev->suspend_hi = 0;
3959                 mddev->sync_speed_min = mddev->sync_speed_max = 0;
3960                 mddev->recovery = 0;
3961                 mddev->in_sync = 0;
3962                 mddev->changed = 0;
3963                 mddev->degraded = 0;
3964                 mddev->barriers_work = 0;
3965                 mddev->safemode = 0;
3966
3967         } else if (mddev->pers)
3968                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
3969                         mdname(mddev));
3970         err = 0;
3971         md_new_event(mddev);
3972         sysfs_notify(&mddev->kobj, NULL, "array_state");
3973 out:
3974         return err;
3975 }
3976
3977 #ifndef MODULE
3978 static void autorun_array(mddev_t *mddev)
3979 {
3980         mdk_rdev_t *rdev;
3981         struct list_head *tmp;
3982         int err;
3983
3984         if (list_empty(&mddev->disks))
3985                 return;
3986
3987         printk(KERN_INFO "md: running: ");
3988
3989         rdev_for_each(rdev, tmp, mddev) {
3990                 char b[BDEVNAME_SIZE];
3991                 printk("<%s>", bdevname(rdev->bdev,b));
3992         }
3993         printk("\n");
3994
3995         err = do_md_run (mddev);
3996         if (err) {
3997                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3998                 do_md_stop (mddev, 0);
3999         }
4000 }
4001
4002 /*
4003  * Let's try to run arrays based on all the disks that have arrived
4004  * so far (they are in pending_raid_disks).
4005  *
4006  * the method: pick the first pending disk, collect all disks with
4007  * the same UUID, remove all from the pending list and put them into
4008  * the 'same_array' list. Then order this list based on superblock
4009  * update time (freshest comes first), kick out 'old' disks and
4010  * compare superblocks. If everything's fine then run it.
4011  *
4012  * If "unit" is allocated, then bump its reference count
4013  */
4014 static void autorun_devices(int part)
4015 {
4016         struct list_head *tmp;
4017         mdk_rdev_t *rdev0, *rdev;
4018         mddev_t *mddev;
4019         char b[BDEVNAME_SIZE];
4020
4021         printk(KERN_INFO "md: autorun ...\n");
4022         while (!list_empty(&pending_raid_disks)) {
4023                 int unit;
4024                 dev_t dev;
4025                 LIST_HEAD(candidates);
4026                 rdev0 = list_entry(pending_raid_disks.next,
4027                                          mdk_rdev_t, same_set);
4028
4029                 printk(KERN_INFO "md: considering %s ...\n",
4030                         bdevname(rdev0->bdev,b));
4031                 INIT_LIST_HEAD(&candidates);
4032                 rdev_for_each_list(rdev, tmp, pending_raid_disks)
4033                         if (super_90_load(rdev, rdev0, 0) >= 0) {
4034                                 printk(KERN_INFO "md:  adding %s ...\n",
4035                                         bdevname(rdev->bdev,b));
4036                                 list_move(&rdev->same_set, &candidates);
4037                         }
4038                 /*
4039                  * now we have a set of devices, with all of them having
4040                  * mostly sane superblocks. It's time to allocate the
4041                  * mddev.
4042                  */
4043                 if (part) {
4044                         dev = MKDEV(mdp_major,
4045                                     rdev0->preferred_minor << MdpMinorShift);
4046                         unit = MINOR(dev) >> MdpMinorShift;
4047                 } else {
4048                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4049                         unit = MINOR(dev);
4050                 }
4051                 if (rdev0->preferred_minor != unit) {
4052                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4053                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4054                         break;
4055                 }
4056
4057                 md_probe(dev, NULL, NULL);
4058                 mddev = mddev_find(dev);
4059                 if (!mddev || !mddev->gendisk) {
4060                         if (mddev)
4061                                 mddev_put(mddev);
4062                         printk(KERN_ERR
4063                                 "md: cannot allocate memory for md drive.\n");
4064                         break;
4065                 }
4066                 if (mddev_lock(mddev)) 
4067                         printk(KERN_WARNING "md: %s locked, cannot run\n",
4068                                mdname(mddev));
4069                 else if (mddev->raid_disks || mddev->major_version
4070                          || !list_empty(&mddev->disks)) {
4071                         printk(KERN_WARNING 
4072                                 "md: %s already running, cannot run %s\n",
4073                                 mdname(mddev), bdevname(rdev0->bdev,b));
4074                         mddev_unlock(mddev);
4075                 } else {
4076                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
4077                         mddev->persistent = 1;
4078                         rdev_for_each_list(rdev, tmp, candidates) {
4079                                 list_del_init(&rdev->same_set);
4080                                 if (bind_rdev_to_array(rdev, mddev))
4081                                         export_rdev(rdev);
4082                         }
4083                         autorun_array(mddev);
4084                         mddev_unlock(mddev);
4085                 }
4086                 /* on success, candidates will be empty; on error
4087                  * it won't be...
4088                  */
4089                 rdev_for_each_list(rdev, tmp, candidates)
4090                         export_rdev(rdev);
4091                 mddev_put(mddev);
4092         }
4093         printk(KERN_INFO "md: ... autorun DONE.\n");
4094 }
4095 #endif /* !MODULE */
4096
4097 static int get_version(void __user * arg)
4098 {
4099         mdu_version_t ver;
4100
4101         ver.major = MD_MAJOR_VERSION;
4102         ver.minor = MD_MINOR_VERSION;
4103         ver.patchlevel = MD_PATCHLEVEL_VERSION;
4104
4105         if (copy_to_user(arg, &ver, sizeof(ver)))
4106                 return -EFAULT;
4107
4108         return 0;
4109 }
4110
4111 static int get_array_info(mddev_t * mddev, void __user * arg)
4112 {
4113         mdu_array_info_t info;
4114         int nr,working,active,failed,spare;
4115         mdk_rdev_t *rdev;
4116         struct list_head *tmp;
4117
4118         nr=working=active=failed=spare=0;
4119         rdev_for_each(rdev, tmp, mddev) {
4120                 nr++;
4121                 if (test_bit(Faulty, &rdev->flags))
4122                         failed++;
4123                 else {
4124                         working++;
4125                         if (test_bit(In_sync, &rdev->flags))
4126                                 active++;       
4127                         else
4128                                 spare++;
4129                 }
4130         }
4131
4132         info.major_version = mddev->major_version;
4133         info.minor_version = mddev->minor_version;
4134         info.patch_version = MD_PATCHLEVEL_VERSION;
4135         info.ctime         = mddev->ctime;
4136         info.level         = mddev->level;
4137         info.size          = mddev->size;
4138         if (info.size != mddev->size) /* overflow */
4139                 info.size = -1;
4140         info.nr_disks      = nr;
4141         info.raid_disks    = mddev->raid_disks;
4142         info.md_minor      = mddev->md_minor;
4143         info.not_persistent= !mddev->persistent;
4144
4145         info.utime         = mddev->utime;
4146         info.state         = 0;
4147         if (mddev->in_sync)
4148                 info.state = (1<<MD_SB_CLEAN);
4149         if (mddev->bitmap && mddev->bitmap_offset)
4150                 info.state = (1<<MD_SB_BITMAP_PRESENT);
4151         info.active_disks  = active;
4152         info.working_disks = working;
4153         info.failed_disks  = failed;
4154         info.spare_disks   = spare;
4155
4156         info.layout        = mddev->layout;
4157         info.chunk_size    = mddev->chunk_size;
4158
4159         if (copy_to_user(arg, &info, sizeof(info)))
4160                 return -EFAULT;
4161
4162         return 0;
4163 }
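/*
 * Illustrative userspace counterpart (a sketch, not part of this file):
 *
 *   #include <stdio.h>
 *   #include <fcntl.h>
 *   #include <sys/ioctl.h>
 *   #include <linux/raid/md_u.h>
 *
 *   mdu_array_info_t info;
 *   int fd = open("/dev/md0", O_RDONLY);
 *   if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *           printf("raid%d: %d/%d disks active\n", info.level,
 *                  info.active_disks, info.raid_disks);
 *
 * The device path is an assumption; GET_ARRAY_INFO is the ioctl that
 * reaches this function.
 */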
4164
4165 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4166 {
4167         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4168         char *ptr, *buf = NULL;
4169         int err = -ENOMEM;
4170
4171         if (md_allow_write(mddev))
4172                 file = kmalloc(sizeof(*file), GFP_NOIO);
4173         else
4174                 file = kmalloc(sizeof(*file), GFP_KERNEL);
4175
4176         if (!file)
4177                 goto out;
4178
4179         /* bitmap disabled, zero the first byte and copy out */
4180         if (!mddev->bitmap || !mddev->bitmap->file) {
4181                 file->pathname[0] = '\0';
4182                 goto copy_out;
4183         }
4184
4185         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4186         if (!buf)
4187                 goto out;
4188
4189         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4190         if (IS_ERR(ptr))
4191                 goto out;
4192
4193         strcpy(file->pathname, ptr);
4194
4195 copy_out:
4196         err = 0;
4197         if (copy_to_user(arg, file, sizeof(*file)))
4198                 err = -EFAULT;
4199 out:
4200         kfree(buf);
4201         kfree(file);
4202         return err;
4203 }
4204
4205 static int get_disk_info(mddev_t * mddev, void __user * arg)
4206 {
4207         mdu_disk_info_t info;
4208         unsigned int nr;
4209         mdk_rdev_t *rdev;
4210
4211         if (copy_from_user(&info, arg, sizeof(info)))
4212                 return -EFAULT;
4213
4214         nr = info.number;
4215
4216         rdev = find_rdev_nr(mddev, nr);
4217         if (rdev) {
4218                 info.major = MAJOR(rdev->bdev->bd_dev);
4219                 info.minor = MINOR(rdev->bdev->bd_dev);
4220                 info.raid_disk = rdev->raid_disk;
4221                 info.state = 0;
4222                 if (test_bit(Faulty, &rdev->flags))
4223                         info.state |= (1<<MD_DISK_FAULTY);
4224                 else if (test_bit(In_sync, &rdev->flags)) {
4225                         info.state |= (1<<MD_DISK_ACTIVE);
4226                         info.state |= (1<<MD_DISK_SYNC);
4227                 }
4228                 if (test_bit(WriteMostly, &rdev->flags))
4229                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
4230         } else {
4231                 info.major = info.minor = 0;
4232                 info.raid_disk = -1;
4233                 info.state = (1<<MD_DISK_REMOVED);
4234         }
4235
4236         if (copy_to_user(arg, &info, sizeof(info)))
4237                 return -EFAULT;
4238
4239         return 0;
4240 }
4241
4242 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4243 {
4244         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4245         mdk_rdev_t *rdev;
4246         dev_t dev = MKDEV(info->major,info->minor);
4247
4248         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4249                 return -EOVERFLOW;
4250
4251         if (!mddev->raid_disks) {
4252                 int err;
4253                 /* expecting a device which has a superblock */
4254                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4255                 if (IS_ERR(rdev)) {
4256                         printk(KERN_WARNING 
4257                                 "md: md_import_device returned %ld\n",
4258                                 PTR_ERR(rdev));
4259                         return PTR_ERR(rdev);
4260                 }
4261                 if (!list_empty(&mddev->disks)) {
4262                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4263                                                         mdk_rdev_t, same_set);
4264                         int err = super_types[mddev->major_version]
4265                                 .load_super(rdev, rdev0, mddev->minor_version);
4266                         if (err < 0) {
4267                                 printk(KERN_WARNING 
4268                                         "md: %s has different UUID to %s\n",
4269                                         bdevname(rdev->bdev,b), 
4270                                         bdevname(rdev0->bdev,b2));
4271                                 export_rdev(rdev);
4272                                 return -EINVAL;
4273                         }
4274                 }
4275                 err = bind_rdev_to_array(rdev, mddev);
4276                 if (err)
4277                         export_rdev(rdev);
4278                 return err;
4279         }
4280
4281         /*
4282          * add_new_disk can be used once the array is assembled
4283          * to add "hot spares".  They must already have a superblock
4284          * written
4285          */
4286         if (mddev->pers) {
4287                 int err;
4288                 if (!mddev->pers->hot_add_disk) {
4289                         printk(KERN_WARNING 
4290                                 "%s: personality does not support diskops!\n",
4291                                mdname(mddev));
4292                         return -EINVAL;
4293                 }
4294                 if (mddev->persistent)
4295                         rdev = md_import_device(dev, mddev->major_version,
4296                                                 mddev->minor_version);
4297                 else
4298                         rdev = md_import_device(dev, -1, -1);
4299                 if (IS_ERR(rdev)) {
4300                         printk(KERN_WARNING 
4301                                 "md: md_import_device returned %ld\n",
4302                                 PTR_ERR(rdev));
4303                         return PTR_ERR(rdev);
4304                 }
4305                 /* set saved_raid_disk if appropriate */
4306                 if (!mddev->persistent) {
4307                         if (info->state & (1<<MD_DISK_SYNC)  &&
4308                             info->raid_disk < mddev->raid_disks)
4309                                 rdev->raid_disk = info->raid_disk;
4310                         else
4311                                 rdev->raid_disk = -1;
4312                 } else
4313                         super_types[mddev->major_version].
4314                                 validate_super(mddev, rdev);
4315                 rdev->saved_raid_disk = rdev->raid_disk;
4316
4317                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4318                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4319                         set_bit(WriteMostly, &rdev->flags);
4320
4321                 rdev->raid_disk = -1;
4322                 err = bind_rdev_to_array(rdev, mddev);
4323                 if (!err && !mddev->pers->hot_remove_disk) {
4324                         /* If there is hot_add_disk but no hot_remove_disk,
4325                          * then added disks are for geometry changes
4326                          * and should be activated immediately.
4327                          */
4328                         super_types[mddev->major_version].
4329                                 validate_super(mddev, rdev);
4330                         err = mddev->pers->hot_add_disk(mddev, rdev);
4331                         if (err)
4332                                 unbind_rdev_from_array(rdev);
4333                 }
4334                 if (err)
4335                         export_rdev(rdev);
4336                 else
4337                         sysfs_notify(&rdev->kobj, NULL, "state");
4338
4339                 md_update_sb(mddev, 1);
4340                 if (mddev->degraded)
4341                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4342                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4343                 md_wakeup_thread(mddev->thread);
4344                 return err;
4345         }
4346
4347         /* otherwise, add_new_disk is only allowed
4348          * for major_version==0 superblocks
4349          */
4350         if (mddev->major_version != 0) {
4351                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4352                        mdname(mddev));
4353                 return -EINVAL;
4354         }
4355
4356         if (!(info->state & (1<<MD_DISK_FAULTY))) {
4357                 int err;
4358                 rdev = md_import_device(dev, -1, 0);
4359                 if (IS_ERR(rdev)) {
4360                         printk(KERN_WARNING 
4361                                 "md: error, md_import_device() returned %ld\n",
4362                                 PTR_ERR(rdev));
4363                         return PTR_ERR(rdev);
4364                 }
4365                 rdev->desc_nr = info->number;
4366                 if (info->raid_disk < mddev->raid_disks)
4367                         rdev->raid_disk = info->raid_disk;
4368                 else
4369                         rdev->raid_disk = -1;
4370
4371                 if (rdev->raid_disk < mddev->raid_disks)
4372                         if (info->state & (1<<MD_DISK_SYNC))
4373                                 set_bit(In_sync, &rdev->flags);
4374
4375                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4376                         set_bit(WriteMostly, &rdev->flags);
4377
4378                 if (!mddev->persistent) {
4379                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
4380                         rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4381                 } else 
4382                         rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4383                 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
4384
4385                 err = bind_rdev_to_array(rdev, mddev);
4386                 if (err) {
4387                         export_rdev(rdev);
4388                         return err;
4389                 }
4390         }
4391
4392         return 0;
4393 }
4394
4395 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4396 {
4397         char b[BDEVNAME_SIZE];
4398         mdk_rdev_t *rdev;
4399
4400         rdev = find_rdev(mddev, dev);
4401         if (!rdev)
4402                 return -ENXIO;
4403
4404         if (rdev->raid_disk >= 0)
4405                 goto busy;
4406
4407         kick_rdev_from_array(rdev);
4408         md_update_sb(mddev, 1);
4409         md_new_event(mddev);
4410
4411         return 0;
4412 busy:
4413         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4414                 bdevname(rdev->bdev,b), mdname(mddev));
4415         return -EBUSY;
4416 }
4417
4418 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4419 {
4420         char b[BDEVNAME_SIZE];
4421         int err;
4422         unsigned int size;
4423         mdk_rdev_t *rdev;
4424
4425         if (!mddev->pers)
4426                 return -ENODEV;
4427
4428         if (mddev->major_version != 0) {
4429                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4430                         " version-0 superblocks.\n",
4431                         mdname(mddev));
4432                 return -EINVAL;
4433         }
4434         if (!mddev->pers->hot_add_disk) {
4435                 printk(KERN_WARNING 
4436                         "%s: personality does not support diskops!\n",
4437                         mdname(mddev));
4438                 return -EINVAL;
4439         }
4440
4441         rdev = md_import_device(dev, -1, 0);
4442         if (IS_ERR(rdev)) {
4443                 printk(KERN_WARNING 
4444                         "md: error, md_import_device() returned %ld\n",
4445                         PTR_ERR(rdev));
4446                 return -EINVAL;
4447         }
4448
4449         if (mddev->persistent)
4450                 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
4451         else
4452                 rdev->sb_offset =
4453                         rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4454
4455         size = calc_dev_size(rdev, mddev->chunk_size);
4456         rdev->size = size;
4457
4458         if (test_bit(Faulty, &rdev->flags)) {
4459                 printk(KERN_WARNING 
4460                         "md: cannot hot-add faulty %s disk to %s!\n",
4461                         bdevname(rdev->bdev,b), mdname(mddev));
4462                 err = -EINVAL;
4463                 goto abort_export;
4464         }
4465         clear_bit(In_sync, &rdev->flags);
4466         rdev->desc_nr = -1;
4467         rdev->saved_raid_disk = -1;
4468         err = bind_rdev_to_array(rdev, mddev);
4469         if (err)
4470                 goto abort_export;
4471
4472         /*
4473          * The rest should better be atomic, we can have disk failures
4474          * noticed in interrupt contexts ...
4475          */
4476
4477         if (rdev->desc_nr == mddev->max_disks) {
4478                 printk(KERN_WARNING "%s: cannot hot-add to full array!\n",
4479                         mdname(mddev));
4480                 err = -EBUSY;
4481                 goto abort_unbind_export;
4482         }
4483
4484         rdev->raid_disk = -1;
4485
4486         md_update_sb(mddev, 1);
4487
4488         /*
4489          * Kick recovery, maybe this spare has to be added to the
4490          * array immediately.
4491          */
4492         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4493         md_wakeup_thread(mddev->thread);
4494         md_new_event(mddev);
4495         return 0;
4496
4497 abort_unbind_export:
4498         unbind_rdev_from_array(rdev);
4499
4500 abort_export:
4501         export_rdev(rdev);
4502         return err;
4503 }
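/* Illustrative (not part of the driver): userspace drives the two
 * paths above via the HOT_ADD_DISK/HOT_REMOVE_DISK ioctls, passing
 * the component's device number, e.g.
 *
 *	ioctl(md_fd, HOT_ADD_DISK, (unsigned long)makedev(8, 16));
 *	ioctl(md_fd, HOT_REMOVE_DISK, (unsigned long)makedev(8, 16));
 *
 * new_decode_dev() in md_ioctl() recovers the dev_t from the argument.
 */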
4504
4505 static int set_bitmap_file(mddev_t *mddev, int fd)
4506 {
4507         int err;
4508
4509         if (mddev->pers) {
4510                 if (!mddev->pers->quiesce)
4511                         return -EBUSY;
4512                 if (mddev->recovery || mddev->sync_thread)
4513                         return -EBUSY;
4514                 /* we should be able to change the bitmap.. */
4515         }
4516
4517
4518         if (fd >= 0) {
4519                 if (mddev->bitmap)
4520                         return -EEXIST; /* cannot add when bitmap is present */
4521                 mddev->bitmap_file = fget(fd);
4522
4523                 if (mddev->bitmap_file == NULL) {
4524                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4525                                mdname(mddev));
4526                         return -EBADF;
4527                 }
4528
4529                 err = deny_bitmap_write_access(mddev->bitmap_file);
4530                 if (err) {
4531                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4532                                mdname(mddev));
4533                         fput(mddev->bitmap_file);
4534                         mddev->bitmap_file = NULL;
4535                         return err;
4536                 }
4537                 mddev->bitmap_offset = 0; /* file overrides offset */
4538         } else if (mddev->bitmap == NULL)
4539                 return -ENOENT; /* cannot remove what isn't there */
4540         err = 0;
4541         if (mddev->pers) {
4542                 mddev->pers->quiesce(mddev, 1);
4543                 if (fd >= 0)
4544                         err = bitmap_create(mddev);
4545                 if (fd < 0 || err) {
4546                         bitmap_destroy(mddev);
4547                         fd = -1; /* make sure to put the file */
4548                 }
4549                 mddev->pers->quiesce(mddev, 0);
4550         }
4551         if (fd < 0) {
4552                 if (mddev->bitmap_file) {
4553                         restore_bitmap_write_access(mddev->bitmap_file);
4554                         fput(mddev->bitmap_file);
4555                 }
4556                 mddev->bitmap_file = NULL;
4557         }
4558
4559         return err;
4560 }
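/* Illustrative userspace usage (cf. mdadm; path hypothetical):
 * SET_BITMAP_FILE takes an open file descriptor to attach an external
 * bitmap, or -1 to drop the current one:
 *
 *	int bfd = open("/var/md0-bitmap", O_RDWR);
 *	ioctl(md_fd, SET_BITMAP_FILE, bfd);
 *	...
 *	ioctl(md_fd, SET_BITMAP_FILE, -1);
 */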
4561
4562 /*
4563  * set_array_info is used in two different ways.
4564  * The original usage is when creating a new array.
4565  * In this usage, raid_disks is > 0 and, together with
4566  *  level, size, not_persistent, layout and chunksize, it determines
4567  *  the shape of the array.
4568  *  This will always create an array with a type-0.90.0 superblock.
4569  * The newer usage is when assembling an array.
4570  *  In this case raid_disks will be 0, and the major_version field is
4571  *  used to determine which style of superblock is to be found on the devices.
4572  *  The minor and patch _version numbers are also kept in case the
4573  *  superblock handler wishes to interpret them.
4574  */
4575 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4576 {
4577
4578         if (info->raid_disks == 0) {
4579                 /* just setting version number for superblock loading */
4580                 if (info->major_version < 0 ||
4581                     info->major_version >= ARRAY_SIZE(super_types) ||
4582                     super_types[info->major_version].name == NULL) {
4583                         /* maybe try to auto-load a module? */
4584                         printk(KERN_INFO 
4585                                 "md: superblock version %d not known\n",
4586                                 info->major_version);
4587                         return -EINVAL;
4588                 }
4589                 mddev->major_version = info->major_version;
4590                 mddev->minor_version = info->minor_version;
4591                 mddev->patch_version = info->patch_version;
4592                 mddev->persistent = !info->not_persistent;
4593                 return 0;
4594         }
4595         mddev->major_version = MD_MAJOR_VERSION;
4596         mddev->minor_version = MD_MINOR_VERSION;
4597         mddev->patch_version = MD_PATCHLEVEL_VERSION;
4598         mddev->ctime         = get_seconds();
4599
4600         mddev->level         = info->level;
4601         mddev->clevel[0]     = 0;
4602         mddev->size          = info->size;
4603         mddev->raid_disks    = info->raid_disks;
4604         /* don't set md_minor, it is determined by which /dev/md* was
4605          * opened
4606          */
4607         if (info->state & (1<<MD_SB_CLEAN))
4608                 mddev->recovery_cp = MaxSector;
4609         else
4610                 mddev->recovery_cp = 0;
4611         mddev->persistent    = ! info->not_persistent;
4612         mddev->external      = 0;
4613
4614         mddev->layout        = info->layout;
4615         mddev->chunk_size    = info->chunk_size;
4616
4617         mddev->max_disks     = MD_SB_DISKS;
4618
4619         if (mddev->persistent)
4620                 mddev->flags         = 0;
4621         set_bit(MD_CHANGE_DEVS, &mddev->flags);
4622
4623         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
4624         mddev->bitmap_offset = 0;
4625
4626         mddev->reshape_position = MaxSector;
4627
4628         /*
4629          * Generate a 128 bit UUID
4630          */
4631         get_random_bytes(mddev->uuid, 16);
4632
4633         mddev->new_level = mddev->level;
4634         mddev->new_chunk = mddev->chunk_size;
4635         mddev->new_layout = mddev->layout;
4636         mddev->delta_disks = 0;
4637
4638         return 0;
4639 }
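/* Illustrative: for the "assemble" usage above, userspace zeroes an
 * mdu_array_info_t (so raid_disks == 0) and sets only the version
 * fields, e.g.
 *
 *	mdu_array_info_t info = { .major_version = 0, .minor_version = 90 };
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);
 *
 * whereas the "create" usage also fills in level, size, raid_disks,
 * layout, chunk_size and not_persistent.
 */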
4640
4641 static int update_size(mddev_t *mddev, unsigned long size)
4642 {
4643         mdk_rdev_t * rdev;
4644         int rv;
4645         struct list_head *tmp;
4646         int fit = (size == 0);
4647
4648         if (mddev->pers->resize == NULL)
4649                 return -EINVAL;
4650         /* The "size" is the amount of each device that is used.
4651          * This can only make sense for arrays with redundancy.
4652          * linear and raid0 always use whatever space is available.
4653          * We can only consider changing the size if no resync
4654          * or reconstruction is happening, and if the new size
4655          * is acceptable. It must fit before the sb_offset or,
4656          * if that is <data_offset, it must fit before the
4657          * size of each device.
4658          * If size is zero, we find the largest size that fits.
4659          */
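        /* Units note: "size" and rdev->size are in 1K blocks, while the
         * sector_t arithmetic below is in 512-byte sectors - hence the
         * "* 2" and "<< 1" conversions.
         */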
4660         if (mddev->sync_thread)
4661                 return -EBUSY;
4662         rdev_for_each(rdev, tmp, mddev) {
4663                 sector_t avail;
4664                 avail = rdev->size * 2;
4665
4666                 if (fit && (size == 0 || size > avail/2))
4667                         size = avail/2;
4668                 if (avail < ((sector_t)size << 1))
4669                         return -ENOSPC;
4670         }
4671         rv = mddev->pers->resize(mddev, (sector_t)size *2);
4672         if (!rv) {
4673                 struct block_device *bdev;
4674
4675                 bdev = bdget_disk(mddev->gendisk, 0);
4676                 if (bdev) {
4677                         mutex_lock(&bdev->bd_inode->i_mutex);
4678                         i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
4679                         mutex_unlock(&bdev->bd_inode->i_mutex);
4680                         bdput(bdev);
4681                 }
4682         }
4683         return rv;
4684 }
4685
4686 static int update_raid_disks(mddev_t *mddev, int raid_disks)
4687 {
4688         int rv;
4689         /* change the number of raid disks */
4690         if (mddev->pers->check_reshape == NULL)
4691                 return -EINVAL;
4692         if (raid_disks <= 0 ||
4693             raid_disks >= mddev->max_disks)
4694                 return -EINVAL;
4695         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
4696                 return -EBUSY;
4697         mddev->delta_disks = raid_disks - mddev->raid_disks;
4698
4699         rv = mddev->pers->check_reshape(mddev);
4700         return rv;
4701 }
4702
4703
4704 /*
4705  * update_array_info is used to change the configuration of an
4706  * on-line array.
4707  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
4708  * fields in the info are checked against the array.
4709  * Any differences that cannot be handled will cause an error.
4710  * Normally, only one change can be managed at a time.
4711  */
4712 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4713 {
4714         int rv = 0;
4715         int cnt = 0;
4716         int state = 0;
4717
4718         /* calculate expected state, ignoring low bits */
4719         if (mddev->bitmap && mddev->bitmap_offset)
4720                 state |= (1 << MD_SB_BITMAP_PRESENT);
4721
4722         if (mddev->major_version != info->major_version ||
4723             mddev->minor_version != info->minor_version ||
4724 /*          mddev->patch_version != info->patch_version || */
4725             mddev->ctime         != info->ctime         ||
4726             mddev->level         != info->level         ||
4727 /*          mddev->layout        != info->layout        || */
4728             !mddev->persistent   != info->not_persistent||
4729             mddev->chunk_size    != info->chunk_size    ||
4730             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
4731             ((state^info->state) & 0xfffffe00)
4732                 )
4733                 return -EINVAL;
4734         /* Check there is only one change */
4735         if (info->size >= 0 && mddev->size != info->size) cnt++;
4736         if (mddev->raid_disks != info->raid_disks) cnt++;
4737         if (mddev->layout != info->layout) cnt++;
4738         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
4739         if (cnt == 0) return 0;
4740         if (cnt > 1) return -EINVAL;
4741
4742         if (mddev->layout != info->layout) {
4743                 /* Change layout
4744                  * we don't need to do anything at the md level, the
4745                  * personality will take care of it all.
4746                  */
4747                 if (mddev->pers->reconfig == NULL)
4748                         return -EINVAL;
4749                 else
4750                         return mddev->pers->reconfig(mddev, info->layout, -1);
4751         }
4752         if (info->size >= 0 && mddev->size != info->size)
4753                 rv = update_size(mddev, info->size);
4754
4755         if (mddev->raid_disks    != info->raid_disks)
4756                 rv = update_raid_disks(mddev, info->raid_disks);
4757
4758         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
4759                 if (mddev->pers->quiesce == NULL)
4760                         return -EINVAL;
4761                 if (mddev->recovery || mddev->sync_thread)
4762                         return -EBUSY;
4763                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
4764                         /* add the bitmap */
4765                         if (mddev->bitmap)
4766                                 return -EEXIST;
4767                         if (mddev->default_bitmap_offset == 0)
4768                                 return -EINVAL;
4769                         mddev->bitmap_offset = mddev->default_bitmap_offset;
4770                         mddev->pers->quiesce(mddev, 1);
4771                         rv = bitmap_create(mddev);
4772                         if (rv)
4773                                 bitmap_destroy(mddev);
4774                         mddev->pers->quiesce(mddev, 0);
4775                 } else {
4776                         /* remove the bitmap */
4777                         if (!mddev->bitmap)
4778                                 return -ENOENT;
4779                         if (mddev->bitmap->file)
4780                                 return -EINVAL;
4781                         mddev->pers->quiesce(mddev, 1);
4782                         bitmap_destroy(mddev);
4783                         mddev->pers->quiesce(mddev, 0);
4784                         mddev->bitmap_offset = 0;
4785                 }
4786         }
4787         md_update_sb(mddev, 1);
4788         return rv;
4789 }
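/* Illustrative: mdadm --grow funnels through this path; e.g. adding an
 * internal bitmap re-sends the current array info with only
 * (1<<MD_SB_BITMAP_PRESENT) newly set in ->state, so exactly one
 * change is detected above.
 */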
4790
4791 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4792 {
4793         mdk_rdev_t *rdev;
4794
4795         if (mddev->pers == NULL)
4796                 return -ENODEV;
4797
4798         rdev = find_rdev(mddev, dev);
4799         if (!rdev)
4800                 return -ENODEV;
4801
4802         md_error(mddev, rdev);
4803         return 0;
4804 }
4805
4806 /*
4807  * We have a problem here : there is no easy way to give a CHS
4808  * virtual geometry. We currently pretend that we have a 2 heads
4809  * 4 sectors (with a BIG number of cylinders...). This drives
4810  * dosfs just mad... ;-)
4811  */
4812 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4813 {
4814         mddev_t *mddev = bdev->bd_disk->private_data;
4815
4816         geo->heads = 2;
4817         geo->sectors = 4;
4818         geo->cylinders = get_capacity(mddev->gendisk) / 8;
4819         return 0;
4820 }
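/* With 2 heads * 4 sectors = 8 sectors per cylinder, a 1TiB array
 * (2^31 sectors) reports 2^28 cylinders - the "BIG number of
 * cylinders" mentioned above.
 */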
4821
4822 static int md_ioctl(struct inode *inode, struct file *file,
4823                         unsigned int cmd, unsigned long arg)
4824 {
4825         int err = 0;
4826         void __user *argp = (void __user *)arg;
4827         mddev_t *mddev = NULL;
4828
4829         if (!capable(CAP_SYS_ADMIN))
4830                 return -EACCES;
4831
4832         /*
4833          * Commands dealing with the RAID driver but not any
4834          * particular array:
4835          */
4836         switch (cmd)
4837         {
4838                 case RAID_VERSION:
4839                         err = get_version(argp);
4840                         goto done;
4841
4842                 case PRINT_RAID_DEBUG:
4843                         err = 0;
4844                         md_print_devices();
4845                         goto done;
4846
4847 #ifndef MODULE
4848                 case RAID_AUTORUN:
4849                         err = 0;
4850                         autostart_arrays(arg);
4851                         goto done;
4852 #endif
4853                 default:;
4854         }
4855
4856         /*
4857          * Commands creating/starting a new array:
4858          */
4859
4860         mddev = inode->i_bdev->bd_disk->private_data;
4861
4862         if (!mddev) {
4863                 BUG();
4864                 goto abort;
4865         }
4866
4867         err = mddev_lock(mddev);
4868         if (err) {
4869                 printk(KERN_INFO 
4870                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
4871                         err, cmd);
4872                 goto abort;
4873         }
4874
4875         switch (cmd)
4876         {
4877                 case SET_ARRAY_INFO:
4878                         {
4879                                 mdu_array_info_t info;
4880                                 if (!arg)
4881                                         memset(&info, 0, sizeof(info));
4882                                 else if (copy_from_user(&info, argp, sizeof(info))) {
4883                                         err = -EFAULT;
4884                                         goto abort_unlock;
4885                                 }
4886                                 if (mddev->pers) {
4887                                         err = update_array_info(mddev, &info);
4888                                         if (err) {
4889                                                 printk(KERN_WARNING "md: couldn't update"
4890                                                        " array info. %d\n", err);
4891                                                 goto abort_unlock;
4892                                         }
4893                                         goto done_unlock;
4894                                 }
4895                                 if (!list_empty(&mddev->disks)) {
4896                                         printk(KERN_WARNING
4897                                                "md: array %s already has disks!\n",
4898                                                mdname(mddev));
4899                                         err = -EBUSY;
4900                                         goto abort_unlock;
4901                                 }
4902                                 if (mddev->raid_disks) {
4903                                         printk(KERN_WARNING
4904                                                "md: array %s already initialised!\n",
4905                                                mdname(mddev));
4906                                         err = -EBUSY;
4907                                         goto abort_unlock;
4908                                 }
4909                                 err = set_array_info(mddev, &info);
4910                                 if (err) {
4911                                         printk(KERN_WARNING "md: couldn't set"
4912                                                " array info. %d\n", err);
4913                                         goto abort_unlock;
4914                                 }
4915                         }
4916                         goto done_unlock;
4917
4918                 default:;
4919         }
4920
4921         /*
4922          * Commands querying/configuring an existing array:
4923          */
4924         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
4925          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
4926         if ((!mddev->raid_disks && !mddev->external)
4927             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
4928             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
4929             && cmd != GET_BITMAP_FILE) {
4930                 err = -ENODEV;
4931                 goto abort_unlock;
4932         }
4933
4934         /*
4935          * Commands even a read-only array can execute:
4936          */
4937         switch (cmd)
4938         {
4939                 case GET_ARRAY_INFO:
4940                         err = get_array_info(mddev, argp);
4941                         goto done_unlock;
4942
4943                 case GET_BITMAP_FILE:
4944                         err = get_bitmap_file(mddev, argp);
4945                         goto done_unlock;
4946
4947                 case GET_DISK_INFO:
4948                         err = get_disk_info(mddev, argp);
4949                         goto done_unlock;
4950
4951                 case RESTART_ARRAY_RW:
4952                         err = restart_array(mddev);
4953                         goto done_unlock;
4954
4955                 case STOP_ARRAY:
4956                         err = do_md_stop (mddev, 0);
4957                         goto done_unlock;
4958
4959                 case STOP_ARRAY_RO:
4960                         err = do_md_stop (mddev, 1);
4961                         goto done_unlock;
4962
4963         }
4964
4965         /*
4966          * The remaining ioctls are changing the state of the
4967          * superblock, so we do not allow them on read-only arrays.
4968          * However non-MD ioctls (e.g. get-size) will still come through
4969          * here and hit the 'default' below, so only disallow
4970          * 'md' ioctls, and switch to rw mode if started auto-readonly.
4971          */
4972         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
4973                 if (mddev->ro == 2) {
4974                         mddev->ro = 0;
4975                         sysfs_notify(&mddev->kobj, NULL, "array_state");
4976                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4977                         md_wakeup_thread(mddev->thread);
4978                 } else {
4979                         err = -EROFS;
4980                         goto abort_unlock;
4981                 }
4982         }
4983
4984         switch (cmd)
4985         {
4986                 case ADD_NEW_DISK:
4987                 {
4988                         mdu_disk_info_t info;
4989                         if (copy_from_user(&info, argp, sizeof(info)))
4990                                 err = -EFAULT;
4991                         else
4992                                 err = add_new_disk(mddev, &info);
4993                         goto done_unlock;
4994                 }
4995
4996                 case HOT_REMOVE_DISK:
4997                         err = hot_remove_disk(mddev, new_decode_dev(arg));
4998                         goto done_unlock;
4999
5000                 case HOT_ADD_DISK:
5001                         err = hot_add_disk(mddev, new_decode_dev(arg));
5002                         goto done_unlock;
5003
5004                 case SET_DISK_FAULTY:
5005                         err = set_disk_faulty(mddev, new_decode_dev(arg));
5006                         goto done_unlock;
5007
5008                 case RUN_ARRAY:
5009                         err = do_md_run (mddev);
5010                         goto done_unlock;
5011
5012                 case SET_BITMAP_FILE:
5013                         err = set_bitmap_file(mddev, (int)arg);
5014                         goto done_unlock;
5015
5016                 default:
5017                         err = -EINVAL;
5018                         goto abort_unlock;
5019         }
5020
5021 done_unlock:
5022 abort_unlock:
5023         mddev_unlock(mddev);
5024
5025         return err;
5026 done:
5027         if (err)
5028                 MD_BUG();
5029 abort:
5030         return err;
5031 }
5032
5033 static int md_open(struct inode *inode, struct file *file)
5034 {
5035         /*
5036          * Succeed if we can lock the mddev, which confirms that
5037          * it isn't being stopped right now.
5038          */
5039         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
5040         int err;
5041
5042         if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5043                 goto out;
5044
5045         err = 0;
5046         mddev_get(mddev);
5047         mddev_unlock(mddev);
5048
5049         check_disk_change(inode->i_bdev);
5050  out:
5051         return err;
5052 }
5053
5054 static int md_release(struct inode *inode, struct file * file)
5055 {
5056         mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
5057
5058         BUG_ON(!mddev);
5059         mddev_put(mddev);
5060
5061         return 0;
5062 }
5063
5064 static int md_media_changed(struct gendisk *disk)
5065 {
5066         mddev_t *mddev = disk->private_data;
5067
5068         return mddev->changed;
5069 }
5070
5071 static int md_revalidate(struct gendisk *disk)
5072 {
5073         mddev_t *mddev = disk->private_data;
5074
5075         mddev->changed = 0;
5076         return 0;
5077 }
5078 static struct block_device_operations md_fops =
5079 {
5080         .owner          = THIS_MODULE,
5081         .open           = md_open,
5082         .release        = md_release,
5083         .ioctl          = md_ioctl,
5084         .getgeo         = md_getgeo,
5085         .media_changed  = md_media_changed,
5086         .revalidate_disk= md_revalidate,
5087 };
5088
5089 static int md_thread(void * arg)
5090 {
5091         mdk_thread_t *thread = arg;
5092
5093         /*
5094          * md_thread is a 'system-thread', its priority should be very
5095          * high. We avoid resource deadlocks individually in each
5096          * raid personality. (RAID5 does preallocation) We also use RR and
5097          * the very same RT priority as kswapd, thus we will never get
5098          * into a priority inversion deadlock.
5099          *
5100          * We definitely have to have equal or higher priority than
5101          * bdflush, otherwise bdflush will deadlock if there are too
5102          * many dirty RAID5 blocks.
5103          */
5104
5105         allow_signal(SIGKILL);
5106         while (!kthread_should_stop()) {
5107
5108                 /* We need to wait INTERRUPTIBLE so that
5109                  * we don't add to the load-average.
5110                  * That means we need to be sure no signals are
5111                  * pending.
5112                  */
5113                 if (signal_pending(current))
5114                         flush_signals(current);
5115
5116                 wait_event_interruptible_timeout
5117                         (thread->wqueue,
5118                          test_bit(THREAD_WAKEUP, &thread->flags)
5119                          || kthread_should_stop(),
5120                          thread->timeout);
5121
5122                 clear_bit(THREAD_WAKEUP, &thread->flags);
5123
5124                 thread->run(thread->mddev);
5125         }
5126
5127         return 0;
5128 }
5129
5130 void md_wakeup_thread(mdk_thread_t *thread)
5131 {
5132         if (thread) {
5133                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5134                 set_bit(THREAD_WAKEUP, &thread->flags);
5135                 wake_up(&thread->wqueue);
5136         }
5137 }
5138
5139 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5140                                  const char *name)
5141 {
5142         mdk_thread_t *thread;
5143
5144         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5145         if (!thread)
5146                 return NULL;
5147
5148         init_waitqueue_head(&thread->wqueue);
5149
5150         thread->run = run;
5151         thread->mddev = mddev;
5152         thread->timeout = MAX_SCHEDULE_TIMEOUT;
5153         thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5154         if (IS_ERR(thread->tsk)) {
5155                 kfree(thread);
5156                 return NULL;
5157         }
5158         return thread;
5159 }
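/* Typical caller (illustrative, following the raid1 personality):
 *
 *	mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
 *
 * kthread_run() above substitutes mdname(mddev) for the "%s",
 * giving thread names such as "md0_raid1".
 */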
5160
5161 void md_unregister_thread(mdk_thread_t *thread)
5162 {
5163         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5164
5165         kthread_stop(thread->tsk);
5166         kfree(thread);
5167 }
5168
5169 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5170 {
5171         if (!mddev) {
5172                 MD_BUG();
5173                 return;
5174         }
5175
5176         if (!rdev || test_bit(Faulty, &rdev->flags))
5177                 return;
5178
5179         if (mddev->external)
5180                 set_bit(Blocked, &rdev->flags);
5181 /*
5182         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5183                 mdname(mddev),
5184                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5185                 __builtin_return_address(0),__builtin_return_address(1),
5186                 __builtin_return_address(2),__builtin_return_address(3));
5187 */
5188         if (!mddev->pers)
5189                 return;
5190         if (!mddev->pers->error_handler)
5191                 return;
5192         mddev->pers->error_handler(mddev,rdev);
5193         if (mddev->degraded)
5194                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5195         set_bit(StateChanged, &rdev->flags);
5196         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5197         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5198         md_wakeup_thread(mddev->thread);
5199         md_new_event_inintr(mddev);
5200 }
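/* Illustrative caller: a personality's I/O completion path reports a
 * failed member device, raid1-style:
 *
 *	if (!uptodate)
 *		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
 *
 * (identifiers are raid1's; other personalities differ in detail.)
 */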
5201
5202 /* seq_file implementation /proc/mdstat */
5203
5204 static void status_unused(struct seq_file *seq)
5205 {
5206         int i = 0;
5207         mdk_rdev_t *rdev;
5208         struct list_head *tmp;
5209
5210         seq_printf(seq, "unused devices: ");
5211
5212         rdev_for_each_list(rdev, tmp, pending_raid_disks) {
5213                 char b[BDEVNAME_SIZE];
5214                 i++;
5215                 seq_printf(seq, "%s ",
5216                               bdevname(rdev->bdev,b));
5217         }
5218         if (!i)
5219                 seq_printf(seq, "<none>");
5220
5221         seq_printf(seq, "\n");
5222 }
5223
5224
5225 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5226 {
5227         sector_t max_blocks, resync, res;
5228         unsigned long dt, db, rt;
5229         int scale;
5230         unsigned int per_milli;
5231
5232         resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
5233
5234         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5235                 max_blocks = mddev->resync_max_sectors >> 1;
5236         else
5237                 max_blocks = mddev->size;
5238
5239         /*
5240          * Should not happen.
5241          */
5242         if (!max_blocks) {
5243                 MD_BUG();
5244                 return;
5245         }
5246         /* Pick 'scale' such that (resync>>scale)*1000 will fit
5247          * in a sector_t, and (max_blocks>>scale) will fit in a
5248          * u32, as those are the requirements for sector_div.
5249          * Thus 'scale' must be at least 10
5250          */
5251         scale = 10;
5252         if (sizeof(sector_t) > sizeof(unsigned long)) {
5253                 while ( max_blocks/2 > (1ULL<<(scale+32)))
5254                         scale++;
5255         }
5256         res = (resync>>scale)*1000;
5257         sector_div(res, (u32)((max_blocks>>scale)+1));
5258
5259         per_milli = res;
5260         {
5261                 int i, x = per_milli/50, y = 20-x;
5262                 seq_printf(seq, "[");
5263                 for (i = 0; i < x; i++)
5264                         seq_printf(seq, "=");
5265                 seq_printf(seq, ">");
5266                 for (i = 0; i < y; i++)
5267                         seq_printf(seq, ".");
5268                 seq_printf(seq, "] ");
5269         }
5270         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5271                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5272                     "reshape" :
5273                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5274                      "check" :
5275                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5276                       "resync" : "recovery"))),
5277                    per_milli/10, per_milli % 10,
5278                    (unsigned long long) resync,
5279                    (unsigned long long) max_blocks);
5280
5281         /*
5282          * We do not want to overflow, so the order of operands and
5283          * the * 100 / 100 trick are important. We do a +1 to be
5284          * safe against division by zero. We only estimate anyway.
5285          *
5286          * dt: time from mark until now
5287          * db: blocks written from mark until now
5288          * rt: remaining time
5289          */
5290         dt = ((jiffies - mddev->resync_mark) / HZ);
5291         if (!dt) dt++;
5292         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5293                 - mddev->resync_mark_cnt;
5294         rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100;
5295
5296         seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
5297
5298         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5299 }
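/* Taken together, the seq_printfs above render a /proc/mdstat progress
 * line of the form (values illustrative):
 *
 *	[=====>...............]  resync = 28.5% (1462357/5120000) finish=3.0min speed=20280K/sec
 */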
5300
5301 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5302 {
5303         struct list_head *tmp;
5304         loff_t l = *pos;
5305         mddev_t *mddev;
5306
5307         if (l >= 0x10000)
5308                 return NULL;
5309         if (!l--)
5310                 /* header */
5311                 return (void*)1;
5312
5313         spin_lock(&all_mddevs_lock);
5314         list_for_each(tmp,&all_mddevs)
5315                 if (!l--) {
5316                         mddev = list_entry(tmp, mddev_t, all_mddevs);
5317                         mddev_get(mddev);
5318                         spin_unlock(&all_mddevs_lock);
5319                         return mddev;
5320                 }
5321         spin_unlock(&all_mddevs_lock);
5322         if (!l--)
5323                 return (void*)2;/* tail */
5324         return NULL;
5325 }
5326
5327 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5328 {
5329         struct list_head *tmp;
5330         mddev_t *next_mddev, *mddev = v;
5331         
5332         ++*pos;
5333         if (v == (void*)2)
5334                 return NULL;
5335
5336         spin_lock(&all_mddevs_lock);
5337         if (v == (void*)1)
5338                 tmp = all_mddevs.next;
5339         else
5340                 tmp = mddev->all_mddevs.next;
5341         if (tmp != &all_mddevs)
5342                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5343         else {
5344                 next_mddev = (void*)2;
5345                 *pos = 0x10000;
5346         }               
5347         spin_unlock(&all_mddevs_lock);
5348
5349         if (v != (void*)1)
5350                 mddev_put(mddev);
5351         return next_mddev;
5352
5353 }
5354
5355 static void md_seq_stop(struct seq_file *seq, void *v)
5356 {
5357         mddev_t *mddev = v;
5358
5359         if (mddev && v != (void*)1 && v != (void*)2)
5360                 mddev_put(mddev);
5361 }
5362
5363 struct mdstat_info {
5364         int event;
5365 };
5366
5367 static int md_seq_show(struct seq_file *seq, void *v)
5368 {
5369         mddev_t *mddev = v;
5370         sector_t size;
5371         struct list_head *tmp2;
5372         mdk_rdev_t *rdev;
5373         struct mdstat_info *mi = seq->private;
5374         struct bitmap *bitmap;
5375
5376         if (v == (void*)1) {
5377                 struct mdk_personality *pers;
5378                 seq_printf(seq, "Personalities : ");
5379                 spin_lock(&pers_lock);
5380                 list_for_each_entry(pers, &pers_list, list)
5381                         seq_printf(seq, "[%s] ", pers->name);
5382
5383                 spin_unlock(&pers_lock);
5384                 seq_printf(seq, "\n");
5385                 mi->event = atomic_read(&md_event_count);
5386                 return 0;
5387         }
5388         if (v == (void*)2) {
5389                 status_unused(seq);
5390                 return 0;
5391         }
5392
5393         if (mddev_lock(mddev) < 0)
5394                 return -EINTR;
5395
5396         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5397                 seq_printf(seq, "%s : %sactive", mdname(mddev),
5398                                                 mddev->pers ? "" : "in");
5399                 if (mddev->pers) {
5400                         if (mddev->ro==1)
5401                                 seq_printf(seq, " (read-only)");
5402                         if (mddev->ro==2)
5403                                 seq_printf(seq, " (auto-read-only)");
5404                         seq_printf(seq, " %s", mddev->pers->name);
5405                 }
5406
5407                 size = 0;
5408                 rdev_for_each(rdev, tmp2, mddev) {
5409                         char b[BDEVNAME_SIZE];
5410                         seq_printf(seq, " %s[%d]",
5411                                 bdevname(rdev->bdev,b), rdev->desc_nr);
5412                         if (test_bit(WriteMostly, &rdev->flags))
5413                                 seq_printf(seq, "(W)");
5414                         if (test_bit(Faulty, &rdev->flags)) {
5415                                 seq_printf(seq, "(F)");
5416                                 continue;
5417                         } else if (rdev->raid_disk < 0)
5418                                 seq_printf(seq, "(S)"); /* spare */
5419                         size += rdev->size;
5420                 }
5421
5422                 if (!list_empty(&mddev->disks)) {
5423                         if (mddev->pers)
5424                                 seq_printf(seq, "\n      %llu blocks",
5425                                         (unsigned long long)mddev->array_size);
5426                         else
5427                                 seq_printf(seq, "\n      %llu blocks",
5428                                         (unsigned long long)size);
5429                 }
5430                 if (mddev->persistent) {
5431                         if (mddev->major_version != 0 ||
5432                             mddev->minor_version != 90) {
5433                                 seq_printf(seq," super %d.%d",
5434                                            mddev->major_version,
5435                                            mddev->minor_version);
5436                         }
5437                 } else if (mddev->external)
5438                         seq_printf(seq, " super external:%s",
5439                                    mddev->metadata_type);
5440                 else
5441                         seq_printf(seq, " super non-persistent");
5442
5443                 if (mddev->pers) {
5444                         mddev->pers->status (seq, mddev);
5445                         seq_printf(seq, "\n      ");
5446                         if (mddev->pers->sync_request) {
5447                                 if (mddev->curr_resync > 2) {
5448                                         status_resync (seq, mddev);
5449                                         seq_printf(seq, "\n      ");
5450                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5451                                         seq_printf(seq, "\tresync=DELAYED\n      ");
5452                                 else if (mddev->recovery_cp < MaxSector)
5453                                         seq_printf(seq, "\tresync=PENDING\n      ");
5454                         }
5455                 } else
5456                         seq_printf(seq, "\n       ");
5457
5458                 if ((bitmap = mddev->bitmap)) {
5459                         unsigned long chunk_kb;
5460                         unsigned long flags;
5461                         spin_lock_irqsave(&bitmap->lock, flags);
5462                         chunk_kb = bitmap->chunksize >> 10;
5463                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5464                                 "%lu%s chunk",
5465                                 bitmap->pages - bitmap->missing_pages,
5466                                 bitmap->pages,
5467                                 (bitmap->pages - bitmap->missing_pages)
5468                                         << (PAGE_SHIFT - 10),
5469                                 chunk_kb ? chunk_kb : bitmap->chunksize,
5470                                 chunk_kb ? "KB" : "B");
5471                         if (bitmap->file) {
5472                                 seq_printf(seq, ", file: ");
5473                                 seq_path(seq, &bitmap->file->f_path, " \t\n");
5474                         }
5475
5476                         seq_printf(seq, "\n");
5477                         spin_unlock_irqrestore(&bitmap->lock, flags);
5478                 }
5479
5480                 seq_printf(seq, "\n");
5481         }
5482         mddev_unlock(mddev);
5483         
5484         return 0;
5485 }
5486
5487 static struct seq_operations md_seq_ops = {
5488         .start  = md_seq_start,
5489         .next   = md_seq_next,
5490         .stop   = md_seq_stop,
5491         .show   = md_seq_show,
5492 };
5493
5494 static int md_seq_open(struct inode *inode, struct file *file)
5495 {
5496         int error;
5497         struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5498         if (mi == NULL)
5499                 return -ENOMEM;
5500
5501         error = seq_open(file, &md_seq_ops);
5502         if (error)
5503                 kfree(mi);
5504         else {
5505                 struct seq_file *p = file->private_data;
5506                 p->private = mi;
5507                 mi->event = atomic_read(&md_event_count);
5508         }
5509         return error;
5510 }
5511
5512 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
5513 {
5514         struct seq_file *m = filp->private_data;
5515         struct mdstat_info *mi = m->private;
5516         int mask;
5517
5518         poll_wait(filp, &md_event_waiters, wait);
5519
5520         /* always allow read */
5521         mask = POLLIN | POLLRDNORM;
5522
5523         if (mi->event != atomic_read(&md_event_count))
5524                 mask |= POLLERR | POLLPRI;
5525         return mask;
5526 }
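/* Illustrative consumer: a monitor such as mdadm --monitor keeps
 * /proc/mdstat open and poll()s it; POLLPRI/POLLERR signals that
 * md_event_count has moved and the file should be re-read:
 *
 *	struct pollfd pfd = { .fd = mdstat_fd, .events = POLLPRI };
 *	poll(&pfd, 1, -1);
 */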
5527
5528 static const struct file_operations md_seq_fops = {
5529         .owner          = THIS_MODULE,
5530         .open           = md_seq_open,
5531         .read           = seq_read,
5532         .llseek         = seq_lseek,
5533         .release        = seq_release_private,
5534         .poll           = mdstat_poll,
5535 };
5536
5537 int register_md_personality(struct mdk_personality *p)
5538 {
5539         spin_lock(&pers_lock);
5540         list_add_tail(&p->list, &pers_list);
5541         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
5542         spin_unlock(&pers_lock);
5543         return 0;
5544 }
5545
5546 int unregister_md_personality(struct mdk_personality *p)
5547 {
5548         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
5549         spin_lock(&pers_lock);
5550         list_del_init(&p->list);
5551         spin_unlock(&pers_lock);
5552         return 0;
5553 }
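/* Personality modules pair these calls in their init/exit hooks,
 * e.g. (illustrative, after raid1):
 *
 *	static int __init raid_init(void)
 *	{
 *		return register_md_personality(&raid1_personality);
 *	}
 *	static void __exit raid_exit(void)
 *	{
 *		unregister_md_personality(&raid1_personality);
 *	}
 */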
5554
5555 static int is_mddev_idle(mddev_t *mddev)
5556 {
5557         mdk_rdev_t * rdev;
5558         struct list_head *tmp;
5559         int idle;
5560         long curr_events;
5561
5562         idle = 1;
5563         rdev_for_each(rdev, tmp, mddev) {
5564                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5565                 curr_events = disk_stat_read(disk, sectors[0]) + 
5566                                 disk_stat_read(disk, sectors[1]) - 
5567                                 atomic_read(&disk->sync_io);
5568                 /* sync IO will cause sync_io to increase before the disk_stats
5569                  * as sync_io is counted when a request starts, and
5570                  * disk_stats is counted when it completes.
5571                  * So resync activity will cause curr_events to be smaller than
5572                  * when there was no such activity.
5573                  * non-sync IO will cause disk_stat to increase without
5574                  * increasing sync_io so curr_events will (eventually)
5575                  * be larger than it was before.  Once it becomes
5576                  * substantially larger, the test below will cause
5577                  * the array to appear non-idle, and resync will slow
5578                  * down.
5579                  * If there is a lot of outstanding resync activity when
5580                  * we set last_event to curr_events, then all that activity
5581                  * completing might cause the array to appear non-idle
5582                  * and resync will be slowed down even though there might
5583                  * not have been non-resync activity.  This will only
5584                  * happen once though.  'last_events' will soon reflect
5585                  * the state where there is little or no outstanding
5586                  * resync requests, and further resync activity will
5587                  * always make curr_events less than last_events.
5588                  *
5589                  */
5590                 if (curr_events - rdev->last_events > 4096) {
5591                         rdev->last_events = curr_events;
5592                         idle = 0;
5593                 }
5594         }
5595         return idle;
5596 }
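/* Note on the threshold: 4096 sectors is 2MB of I/O not attributable
 * to resync since the last check; anything less is treated as the
 * noise described above and the array still counts as idle.
 */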
5597
5598 void md_done_sync(mddev_t *mddev, int blocks, int ok)
5599 {
5600         /* another "blocks" (512-byte) blocks have been synced */
5601         atomic_sub(blocks, &mddev->recovery_active);
5602         wake_up(&mddev->recovery_wait);
5603         if (!ok) {
5604                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5605                 md_wakeup_thread(mddev->thread);
5606                 /* stop recovery, signal do_sync ... */
5607         }
5608 }
5609
5610
5611 /* md_write_start(mddev, bi)
5612  * If we need to update some array metadata (e.g. 'active' flag
5613  * in superblock) before writing, schedule a superblock update
5614  * and wait for it to complete.
5615  */
5616 void md_write_start(mddev_t *mddev, struct bio *bi)
5617 {
5618         int did_change = 0;
5619         if (bio_data_dir(bi) != WRITE)
5620                 return;
5621
5622         BUG_ON(mddev->ro == 1);
5623         if (mddev->ro == 2) {
5624                 /* need to switch to read/write */
5625                 mddev->ro = 0;
5626                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5627                 md_wakeup_thread(mddev->thread);
5628                 md_wakeup_thread(mddev->sync_thread);
5629                 did_change = 1;
5630         }
5631         atomic_inc(&mddev->writes_pending);
5632         if (mddev->safemode == 1)
5633                 mddev->safemode = 0;
5634         if (mddev->in_sync) {
5635                 spin_lock_irq(&mddev->write_lock);
5636                 if (mddev->in_sync) {
5637                         mddev->in_sync = 0;
5638                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5639                         md_wakeup_thread(mddev->thread);
5640                         did_change = 1;
5641                 }
5642                 spin_unlock_irq(&mddev->write_lock);
5643         }
5644         if (did_change)
5645                 sysfs_notify(&mddev->kobj, NULL, "array_state");
5646         wait_event(mddev->sb_wait,
5647                    !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5648                    !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5649 }
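/* Illustrative pairing in a personality's write path:
 *
 *	md_write_start(mddev, bio);	-- may block until sb marked 'active'
 *	... queue or submit the write ...
 *	md_write_end(mddev);		-- from the completion path
 */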
5650
5651 void md_write_end(mddev_t *mddev)
5652 {
5653         if (atomic_dec_and_test(&mddev->writes_pending)) {
5654                 if (mddev->safemode == 2)
5655                         md_wakeup_thread(mddev->thread);
5656                 else if (mddev->safemode_delay)
5657                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5658         }
5659 }
5660
5661 /* md_allow_write(mddev)
5662  * Calling this ensures that the array is marked 'active' so that writes
5663  * may proceed without blocking.  It is important to call this before
5664  * attempting a GFP_KERNEL allocation while holding the mddev lock.
5665  * Must be called with mddev_lock held.
5666  *
5667  * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5668  * is dropped, so return -EAGAIN after notifying userspace.
5669  */
5670 int md_allow_write(mddev_t *mddev)
5671 {
5672         if (!mddev->pers)
5673                 return 0;
5674         if (mddev->ro)
5675                 return 0;
5676         if (!mddev->pers->sync_request)
5677                 return 0;
5678
5679         spin_lock_irq(&mddev->write_lock);
5680         if (mddev->in_sync) {
5681                 mddev->in_sync = 0;
5682                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5683                 if (mddev->safemode_delay &&
5684                     mddev->safemode == 0)
5685                         mddev->safemode = 1;
5686                 spin_unlock_irq(&mddev->write_lock);
5687                 md_update_sb(mddev, 0);
5688                 sysfs_notify(&mddev->kobj, NULL, "array_state");
5689         } else
5690                 spin_unlock_irq(&mddev->write_lock);
5691
5692         if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5693                 return -EAGAIN;
5694         else
5695                 return 0;
5696 }
5697 EXPORT_SYMBOL_GPL(md_allow_write);
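/* Illustrative caller: mark the array active before a GFP_KERNEL
 * allocation made while holding the mddev lock:
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;	-- -EAGAIN: external metadata, retry
 *	new = kzalloc(len, GFP_KERNEL);
 */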
5698
5699 #define SYNC_MARKS      10
5700 #define SYNC_MARK_STEP  (3*HZ)
5701 void md_do_sync(mddev_t *mddev)
5702 {
5703         mddev_t *mddev2;
5704         unsigned int currspeed = 0,
5705                  window;
5706         sector_t max_sectors,j, io_sectors;
5707         unsigned long mark[SYNC_MARKS];
5708         sector_t mark_cnt[SYNC_MARKS];
5709         int last_mark,m;
5710         struct list_head *tmp;
5711         sector_t last_check;
5712         int skipped = 0;
5713         struct list_head *rtmp;
5714         mdk_rdev_t *rdev;
5715         char *desc;
5716
5717         /* just in case the thread restarts... */
5718         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5719                 return;
5720         if (mddev->ro) /* never try to sync a read-only array */
5721                 return;
5722
5723         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5724                 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
5725                         desc = "data-check";
5726                 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5727                         desc = "requested-resync";
5728                 else
5729                         desc = "resync";
5730         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5731                 desc = "reshape";
5732         else
5733                 desc = "recovery";
5734
5735         /* we overload curr_resync somewhat here.
5736          * 0 == not engaged in resync at all
5737          * 2 == checking that there is no conflict with another sync
5738          * 1 == like 2, but have yielded to allow conflicting resync to
5739          *              commence
5740          * other == active in resync - this many blocks
5741          *
5742          * Before starting a resync we must have set curr_resync to
5743          * 2, and then checked that every "conflicting" array has curr_resync
5744          * less than ours.  When we find one that is the same or higher
5745          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
5746          * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
5747          * This will mean we have to start checking from the beginning again.
5748          *
5749          */
5750
5751         do {
5752                 mddev->curr_resync = 2;
5753
5754         try_again:
5755                 if (kthread_should_stop()) {
5756                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5757                         goto skip;
5758                 }
5759                 for_each_mddev(mddev2, tmp) {
5760                         if (mddev2 == mddev)
5761                                 continue;
5762                         if (!mddev->parallel_resync
5763                         &&  mddev2->curr_resync
5764                         &&  match_mddev_units(mddev, mddev2)) {
5765                                 DEFINE_WAIT(wq);
5766                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
5767                                         /* arbitrarily yield */
5768                                         mddev->curr_resync = 1;
5769                                         wake_up(&resync_wait);
5770                                 }
5771                                 if (mddev > mddev2 && mddev->curr_resync == 1)
5772                                         /* no need to wait here, we can wait the next
5773                                          * time 'round when curr_resync == 2
5774                                          */
5775                                         continue;
5776                                 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
5777                                 if (!kthread_should_stop() &&
5778                                     mddev2->curr_resync >= mddev->curr_resync) {
5779                                         printk(KERN_INFO "md: delaying %s of %s"
5780                                                " until %s has finished (they"
5781                                                " share one or more physical units)\n",
5782                                                desc, mdname(mddev), mdname(mddev2));
5783                                         mddev_put(mddev2);
5784                                         schedule();
5785                                         finish_wait(&resync_wait, &wq);
5786                                         goto try_again;
5787                                 }
5788                                 finish_wait(&resync_wait, &wq);
5789                         }
5790                 }
5791         } while (mddev->curr_resync < 2);
5792
5793         j = 0;
5794         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5795                 /* resync follows the size requested by the personality,
5796                  * which defaults to physical size, but can be virtual size
5797                  */
5798                 max_sectors = mddev->resync_max_sectors;
5799                 mddev->resync_mismatches = 0;
5800                 /* we don't use the checkpoint if there's a bitmap */
5801                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5802                         j = mddev->resync_min;
5803                 else if (!mddev->bitmap)
5804                         j = mddev->recovery_cp;
5805
5806         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5807                 max_sectors = mddev->size << 1;
5808         else {
5809                 /* recovery follows the physical size of devices */
5810                 max_sectors = mddev->size << 1;
5811                 j = MaxSector;
5812                 rdev_for_each(rdev, rtmp, mddev)
5813                         if (rdev->raid_disk >= 0 &&
5814                             !test_bit(Faulty, &rdev->flags) &&
5815                             !test_bit(In_sync, &rdev->flags) &&
5816                             rdev->recovery_offset < j)
5817                                 j = rdev->recovery_offset;
5818         }
5819
5820         printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
5821         printk(KERN_INFO "md: minimum _guaranteed_  speed:"
5822                 " %d KB/sec/disk.\n", speed_min(mddev));
5823         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
5824                "(but not more than %d KB/sec) for %s.\n",
5825                speed_max(mddev), desc);
5826
5827         is_mddev_idle(mddev); /* this also initializes IO event counters */
5828
5829         io_sectors = 0;
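        /*
         * Prime the SYNC_MARKS ring of (timestamp, sectors-done) samples;
         * resync_mark/resync_mark_cnt always track the oldest sample, so
         * the speed computed further down is an average over roughly
         * SYNC_MARKS * SYNC_MARK_STEP of wall-clock time.
         */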
5830         for (m = 0; m < SYNC_MARKS; m++) {
5831                 mark[m] = jiffies;
5832                 mark_cnt[m] = io_sectors;
5833         }
5834         last_mark = 0;
5835         mddev->resync_mark = mark[last_mark];
5836         mddev->resync_mark_cnt = mark_cnt[last_mark];
5837
5838         /*
5839          * Tune reconstruction:
5840          */
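        /*
         * The window bounds how much resync IO may be in flight at once:
         * with 4KB pages (an assumption; it scales with PAGE_SIZE) this is
         * 32 * 8 = 256 sectors, reported as a 128k window below.
         */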
5841         window = 32*(PAGE_SIZE/512);
5842         printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
5843                window/2, (unsigned long long) max_sectors/2);
5844
5845         atomic_set(&mddev->recovery_active, 0);
5846         last_check = 0;
5847
5848         if (j > 2) {
5849                 printk(KERN_INFO
5850                        "md: resuming %s of %s from checkpoint.\n",
5851                        desc, mdname(mddev));
5852                 mddev->curr_resync = j;
5853         }
5854
5855         while (j < max_sectors) {
5856                 sector_t sectors;
5857
5858                 skipped = 0;
5859                 if (j >= mddev->resync_max) {
5860                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5861                         wait_event(mddev->recovery_wait,
5862                                    mddev->resync_max > j
5863                                    || kthread_should_stop());
5864                 }
5865                 if (kthread_should_stop())
5866                         goto interrupted;
5867                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
5868                                                   currspeed < speed_min(mddev));
5869                 if (sectors == 0) {
5870                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5871                         goto out;
5872                 }
5873
5874                 if (!skipped) { /* actual IO requested */
5875                         io_sectors += sectors;
5876                         atomic_add(sectors, &mddev->recovery_active);
5877                 }
5878
5879                 j += sectors;
5880                 if (j > 1) mddev->curr_resync = j;
5881                 mddev->curr_mark_cnt = io_sectors;
5882                 if (last_check == 0)
5883                         /* this is the earliest that the rebuild will be
5884                          * visible in /proc/mdstat
5885                          */
5886                         md_new_event(mddev);
5887
5888                 if (last_check + window > io_sectors || j == max_sectors)
5889                         continue;
5890
5891                 last_check = io_sectors;
5892
5893                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5894                         break;
5895
5896         repeat:
5897                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
5898                         /* step marks */
5899                         int next = (last_mark+1) % SYNC_MARKS;
5900
5901                         mddev->resync_mark = mark[next];
5902                         mddev->resync_mark_cnt = mark_cnt[next];
5903                         mark[next] = jiffies;
5904                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
5905                         last_mark = next;
5906                 }
5907
5908
5909                 if (kthread_should_stop())
5910                         goto interrupted;
5911
5912
5913                 /*
5914                  * this loop exits only if we are either slower than the
5915                  * 'hard' speed limit, or the system was IO-idle for a
5916                  * jiffy.
5917                  * the system might be non-idle CPU-wise, but we only care
5918                  * about not overloading the IO subsystem (things like an
5919                  * e2fsck being done on the RAID array should execute fast).
5920                  */
5921                 blk_unplug(mddev->queue);
5922                 cond_resched();
5923
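                /*
                 * currspeed is in KB/sec: sectors done since the oldest mark,
                 * halved to kilobytes, divided by the elapsed whole seconds;
                 * the "+1" terms merely keep both divisor and result nonzero.
                 */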
5924                 currspeed = ((unsigned long)(io_sectors - mddev->resync_mark_cnt))/2
5925                         / ((jiffies - mddev->resync_mark)/HZ + 1) + 1;
5926
5927                 if (currspeed > speed_min(mddev)) {
5928                         if ((currspeed > speed_max(mddev)) ||
5929                                         !is_mddev_idle(mddev)) {
5930                                 msleep(500);
5931                                 goto repeat;
5932                         }
5933                 }
5934         }
5935         printk(KERN_INFO "md: %s: %s done.\n", mdname(mddev), desc);
5936         /*
5937          * this also signals 'finished resyncing' to md_stop
5938          */
5939  out:
5940         blk_unplug(mddev->queue);
5941
5942         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
5943
5944         /* tell personality that we are finished */
5945         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
5946
5947         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5948             mddev->curr_resync > 2) {
5949                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5950                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5951                                 if (mddev->curr_resync >= mddev->recovery_cp) {
5952                                         printk(KERN_INFO
5953                                                "md: checkpointing %s of %s.\n",
5954                                                desc, mdname(mddev));
5955                                         mddev->recovery_cp = mddev->curr_resync;
5956                                 }
5957                         } else
5958                                 mddev->recovery_cp = MaxSector;
5959                 } else {
5960                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5961                                 mddev->curr_resync = MaxSector;
5962                         rdev_for_each(rdev, rtmp, mddev)
5963                                 if (rdev->raid_disk >= 0 &&
5964                                     !test_bit(Faulty, &rdev->flags) &&
5965                                     !test_bit(In_sync, &rdev->flags) &&
5966                                     rdev->recovery_offset < mddev->curr_resync)
5967                                         rdev->recovery_offset = mddev->curr_resync;
5968                 }
5969         }
5970         set_bit(MD_CHANGE_DEVS, &mddev->flags);
5971
5972  skip:
5973         mddev->curr_resync = 0;
5974         mddev->resync_min = 0;
5975         mddev->resync_max = MaxSector;
5976         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5977         wake_up(&resync_wait);
5978         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
5979         md_wakeup_thread(mddev->thread);
5980         return;
5981
5982  interrupted:
5983         /*
5984          * got a signal, exit.
5985          */
5986         printk(KERN_INFO
5987                "md: md_do_sync() got signal ... exiting\n");
5988         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5989         goto out;
5990
5991 }
5992 EXPORT_SYMBOL_GPL(md_do_sync);
5993
5994
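/*
 * First pass: detach any failed or not-in-sync device that is idle
 * (nr_pending == 0) and not Blocked.  Second pass, only if the array is
 * degraded: offer each unattached, non-faulty device back to the
 * personality as a spare.  The return value is the number of recovery
 * candidates, which the caller uses to decide whether to start recovery.
 */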
5995 static int remove_and_add_spares(mddev_t *mddev)
5996 {
5997         mdk_rdev_t *rdev;
5998         struct list_head *rtmp;
5999         int spares = 0;
6000
6001         rdev_for_each(rdev, rtmp, mddev)
6002                 if (rdev->raid_disk >= 0 &&
6003                     !test_bit(Blocked, &rdev->flags) &&
6004                     (test_bit(Faulty, &rdev->flags) ||
6005                      !test_bit(In_sync, &rdev->flags)) &&
6006                     atomic_read(&rdev->nr_pending)==0) {
6007                         if (mddev->pers->hot_remove_disk(
6008                                     mddev, rdev->raid_disk)==0) {
6009                                 char nm[20];
6010                                 sprintf(nm, "rd%d", rdev->raid_disk);
6011                                 sysfs_remove_link(&mddev->kobj, nm);
6012                                 rdev->raid_disk = -1;
6013                         }
6014                 }
6015
6016         if (mddev->degraded) {
6017                 rdev_for_each(rdev, rtmp, mddev) {
6018                         if (rdev->raid_disk >= 0 &&
6019                             !test_bit(In_sync, &rdev->flags))
6020                                 spares++;
6021                         if (rdev->raid_disk < 0
6022                             && !test_bit(Faulty, &rdev->flags)) {
6023                                 rdev->recovery_offset = 0;
6024                                 if (mddev->pers->
6025                                     hot_add_disk(mddev, rdev) == 0) {
6026                                         char nm[20];
6027                                         sprintf(nm, "rd%d", rdev->raid_disk);
6028                                         if (sysfs_create_link(&mddev->kobj,
6029                                                               &rdev->kobj, nm))
6030                                                 printk(KERN_WARNING
6031                                                        "md: cannot register "
6032                                                        "%s for %s\n",
6033                                                        nm, mdname(mddev));
6034                                         spares++;
6035                                         md_new_event(mddev);
6036                                 } else
6037                                         break;
6038                         }
6039                 }
6040         }
6041         return spares;
6042 }
6043 /*
6044  * This routine is regularly called by all per-raid-array threads to
6045  * deal with generic issues like resync and super-block update.
6046  * Raid personalities that don't have a thread (linear/raid0) do not
6047  * need this as they never do any recovery or update the superblock.
6048  *
6049  * It does not do any resync itself, but rather "forks" off other threads
6050  * to do that as needed.
6051  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6052  * "->recovery" and create a thread at ->sync_thread.
6053  * When the thread finishes it sets MD_RECOVERY_DONE
6054  * and wakes up this thread, which will reap the sync thread and finish up.
6055  * This thread also removes any faulty devices (with nr_pending == 0).
6056  *
6057  * The overall approach is:
6058  *  1/ if the superblock needs updating, update it.
6059  *  2/ If a recovery thread is running, don't do anything else.
6060  *  3/ If recovery has finished, clean up, possibly marking spares active.
6061  *  4/ If there are any faulty devices, remove them.
6062  *  5/ If the array is degraded, try to add spare devices.
6063  *  6/ If array has spares or is not in-sync, start a resync thread.
6064  */
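/*
 * As a sketch (names follow the personality convention, not a verbatim
 * excerpt), a personality's daemon thread is expected to call this on
 * every wakeup, roughly:
 *
 *      static void raidXd(mddev_t *mddev)
 *      {
 *              md_check_recovery(mddev);
 *              ... handle the personality's own deferred IO ...
 *      }
 */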
6065 void md_check_recovery(mddev_t *mddev)
6066 {
6067         mdk_rdev_t *rdev;
6068         struct list_head *rtmp;
6069
6070
6071         if (mddev->bitmap)
6072                 bitmap_daemon_work(mddev->bitmap);
6073
6074         if (mddev->ro)
6075                 return;
6076
6077         if (signal_pending(current)) {
6078                 if (mddev->pers->sync_request && !mddev->external) {
6079                         printk(KERN_INFO "md: %s in immediate safe mode\n",
6080                                mdname(mddev));
6081                         mddev->safemode = 2;
6082                 }
6083                 flush_signals(current);
6084         }
6085
6086         if (!(
6087                 (mddev->flags && !mddev->external) ||
6088                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6089                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6090                 (mddev->external == 0 && mddev->safemode == 1) ||
6091                 (mddev->safemode == 2 && !atomic_read(&mddev->writes_pending)
6092                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6093                 ))
6094                 return;
6095
6096         if (mddev_trylock(mddev)) {
6097                 int spares = 0;
6098
6099                 if (!mddev->external) {
6100                         int did_change = 0;
6101                         spin_lock_irq(&mddev->write_lock);
6102                         if (mddev->safemode &&
6103                             !atomic_read(&mddev->writes_pending) &&
6104                             !mddev->in_sync &&
6105                             mddev->recovery_cp == MaxSector) {
6106                                 mddev->in_sync = 1;
6107                                 did_change = 1;
6108                                 if (mddev->persistent)
6109                                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6110                         }
6111                         if (mddev->safemode == 1)
6112                                 mddev->safemode = 0;
6113                         spin_unlock_irq(&mddev->write_lock);
6114                         if (did_change)
6115                                 sysfs_notify(&mddev->kobj, NULL, "array_state");
6116                 }
6117
6118                 if (mddev->flags)
6119                         md_update_sb(mddev, 0);
6120
6121                 rdev_for_each(rdev, rtmp, mddev)
6122                         if (test_and_clear_bit(StateChanged, &rdev->flags))
6123                                 sysfs_notify(&rdev->kobj, NULL, "state");
6124
6125
6126                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6127                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6128                         /* resync/recovery still happening */
6129                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6130                         goto unlock;
6131                 }
6132                 if (mddev->sync_thread) {
6133                         /* resync has finished, collect result */
6134                         md_unregister_thread(mddev->sync_thread);
6135                         mddev->sync_thread = NULL;
6136                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6137                                 /* success...*/
6138                                 /* activate any spares */
6139                                 if (mddev->pers->spare_active(mddev))
6140                                         sysfs_notify(&mddev->kobj, NULL,
6141                                                      "degraded");
6142                         }
6143                         md_update_sb(mddev, 1);
6144
6145                         /* if the array is no longer degraded, then any saved_raid_disk
6146                          * information must be scrapped
6147                          */
6148                         if (!mddev->degraded)
6149                                 rdev_for_each(rdev, rtmp, mddev)
6150                                         rdev->saved_raid_disk = -1;
6151
6152                         mddev->recovery = 0;
6153                         /* flag recovery needed just to double check */
6154                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6155                         sysfs_notify(&mddev->kobj, NULL, "sync_action");
6156                         md_new_event(mddev);
6157                         goto unlock;
6158                 }
6159                 /* Set RUNNING before clearing NEEDED to avoid
6160                  * any transients in the value of "sync_action".
6161                  */
6162                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6163                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6164                 /* Clear some bits that don't mean anything, but
6165                  * might be left set
6166                  */
6167                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6168                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6169
6170                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6171                         goto unlock;
6172                 /* no recovery is running.
6173                  * remove any failed drives, then
6174                  * add spares if possible.
6175                  * Spares are also removed and re-added, to allow
6176                  * the personality to fail the re-add.
6177                  */
6178
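                /*
                 * Decision ladder: an unfinished reshape takes priority,
                 * then recovery if remove_and_add_spares() found candidates,
                 * then a resync if recovery_cp records an incomplete sync;
                 * otherwise there is nothing to be done.
                 */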
6179                 if (mddev->reshape_position != MaxSector) {
6180                         if (mddev->pers->check_reshape(mddev) != 0)
6181                                 /* Cannot proceed */
6182                                 goto unlock;
6183                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6184                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6185                 } else if ((spares = remove_and_add_spares(mddev))) {
6186                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6187                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6188                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6189                 } else if (mddev->recovery_cp < MaxSector) {
6190                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6191                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6192                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6193                         /* nothing to be done ... */
6194                         goto unlock;
6195
6196                 if (mddev->pers->sync_request) {
6197                         if (spares && mddev->bitmap && !mddev->bitmap->file) {
6198                                 /* We are adding a device or devices to an array
6199                                  * which has the bitmap stored on all devices.
6200                                  * So make sure all bitmap pages get written
6201                                  */
6202                                 bitmap_write_all(mddev->bitmap);
6203                         }
6204                         mddev->sync_thread = md_register_thread(md_do_sync,
6205                                                                 mddev,
6206                                                                 "%s_resync");
6207                         if (!mddev->sync_thread) {
6208                                 printk(KERN_ERR "%s: could not start resync"
6209                                         " thread...\n", 
6210                                         mdname(mddev));
6211                                 /* leave the spares where they are, it shouldn't hurt */
6212                                 mddev->recovery = 0;
6213                         } else
6214                                 md_wakeup_thread(mddev->sync_thread);
6215                         sysfs_notify(&mddev->kobj, NULL, "sync_action");
6216                         md_new_event(mddev);
6217                 }
6218         unlock:
6219                 if (!mddev->sync_thread) {
6220                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6221                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6222                                                &mddev->recovery))
6223                                 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6224                 }
6225                 mddev_unlock(mddev);
6226         }
6227 }
6228
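/*
 * Wait, for at most five seconds, for user space to clear the Blocked
 * flag on the given rdev.  The caller must already hold a reference
 * through nr_pending; that reference is dropped here before returning.
 */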
6229 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6230 {
6231         sysfs_notify(&rdev->kobj, NULL, "state");
6232         wait_event_timeout(rdev->blocked_wait,
6233                            !test_bit(Blocked, &rdev->flags),
6234                            msecs_to_jiffies(5000));
6235         rdev_dec_pending(rdev, mddev);
6236 }
6237 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6238
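/*
 * Reboot notifier: on shutdown, halt or power-off, stop every array we
 * can lock so that arrays go down cleanly before the machine does.
 */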
6239 static int md_notify_reboot(struct notifier_block *this,
6240                             unsigned long code, void *x)
6241 {
6242         struct list_head *tmp;
6243         mddev_t *mddev;
6244
6245         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6246
6247                 printk(KERN_INFO "md: stopping all md devices.\n");
6248
6249                 for_each_mddev(mddev, tmp)
6250                         if (mddev_trylock(mddev)) {
6251                                 do_md_stop(mddev, 1);
6252                                 mddev_unlock(mddev);
6253                         }
6254                 /*
6255                  * certain more exotic SCSI devices are known to be
6256                  * volatile wrt too early system reboots. While the
6257                  * right place to handle this issue is the given
6258                  * driver, we do want to have a safe RAID driver ...
6259                  */
6260                 mdelay(1000);
6261         }
6262         return NOTIFY_DONE;
6263 }
6264
6265 static struct notifier_block md_notifier = {
6266         .notifier_call  = md_notify_reboot,
6267         .next           = NULL,
6268         .priority       = INT_MAX, /* before any real devices */
6269 };
6270
6271 static void md_geninit(void)
6272 {
6273         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6274
6275         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6276 }
6277
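/*
 * Register both block majors: the static MD_MAJOR for classic "md"
 * devices and a dynamically allocated major for the partitionable
 * "mdp" devices, then hook up md_probe() so that opening a device
 * node creates the corresponding array on demand.
 */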
6278 static int __init md_init(void)
6279 {
6280         if (register_blkdev(MAJOR_NR, "md"))
6281                 return -1;
6282         if ((mdp_major = register_blkdev(0, "mdp")) <= 0) {
6283                 unregister_blkdev(MAJOR_NR, "md");
6284                 return -1;
6285         }
6286         blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
6287                             md_probe, NULL, NULL);
6288         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6289                             md_probe, NULL, NULL);
6290
6291         register_reboot_notifier(&md_notifier);
6292         raid_table_header = register_sysctl_table(raid_root_table);
6293
6294         md_geninit();
6295         return 0;
6296 }
6297
6298
6299 #ifndef MODULE
6300
6301 /*
6302  * Searches all registered partitions for autorun RAID arrays
6303  * at boot time.
6304  */
6305
6306 static LIST_HEAD(all_detected_devices);
6307 struct detected_devices_node {
6308         struct list_head list;
6309         dev_t dev;
6310 };
6311
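/*
 * Called during partition scanning (typically for msdos partitions of
 * type 0xfd, "Linux raid autodetect") to queue a device for the
 * autostart_arrays() pass below.
 */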
6312 void md_autodetect_dev(dev_t dev)
6313 {
6314         struct detected_devices_node *node_detected_dev;
6315
6316         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6317         if (node_detected_dev) {
6318                 node_detected_dev->dev = dev;
6319                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
6320         } else {
6321                 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6322                         ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6323         }
6324 }
6325
6326
6327 static void autostart_arrays(int part)
6328 {
6329         mdk_rdev_t *rdev;
6330         struct detected_devices_node *node_detected_dev;
6331         dev_t dev;
6332         int i_scanned, i_passed;
6333
6334         i_scanned = 0;
6335         i_passed = 0;
6336
6337         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6338
6339         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6340                 i_scanned++;
6341                 node_detected_dev = list_entry(all_detected_devices.next,
6342                                         struct detected_devices_node, list);
6343                 list_del(&node_detected_dev->list);
6344                 dev = node_detected_dev->dev;
6345                 kfree(node_detected_dev);
6346                 rdev = md_import_device(dev, 0, 90);
6347                 if (IS_ERR(rdev))
6348                         continue;
6349
6350                 if (test_bit(Faulty, &rdev->flags)) {
6351                         MD_BUG();
6352                         continue;
6353                 }
6354                 set_bit(AutoDetected, &rdev->flags);
6355                 list_add(&rdev->same_set, &pending_raid_disks);
6356                 i_passed++;
6357         }
6358
6359         printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6360                                                 i_scanned, i_passed);
6361
6362         autorun_devices(part);
6363 }
6364
6365 #endif /* !MODULE */
6366
6367 static __exit void md_exit(void)
6368 {
6369         mddev_t *mddev;
6370         struct list_head *tmp;
6371
6372         blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
6373         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6374
6375         unregister_blkdev(MAJOR_NR,"md");
6376         unregister_blkdev(mdp_major, "mdp");
6377         unregister_reboot_notifier(&md_notifier);
6378         unregister_sysctl_table(raid_table_header);
6379         remove_proc_entry("mdstat", NULL);
6380         for_each_mddev(mddev, tmp) {
6381                 struct gendisk *disk = mddev->gendisk;
6382                 if (!disk)
6383                         continue;
6384                 export_array(mddev);
6385                 del_gendisk(disk);
6386                 put_disk(disk);
6387                 mddev->gendisk = NULL;
6388                 mddev_put(mddev);
6389         }
6390 }
6391
6392 subsys_initcall(md_init);
6393 module_exit(md_exit)
6394
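/*
 * The "start_ro" module parameter: when non-zero, newly started arrays
 * come up read-only until the first write arrives.  A usage sketch
 * (assuming the driver is loaded as the md_mod module):
 *
 *      echo 1 > /sys/module/md_mod/parameters/start_ro
 */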
6395 static int get_ro(char *buffer, struct kernel_param *kp)
6396 {
6397         return sprintf(buffer, "%d", start_readonly);
6398 }
6399 static int set_ro(const char *val, struct kernel_param *kp)
6400 {
6401         char *e;
6402         int num = simple_strtoul(val, &e, 10);
6403         if (*val && (*e == '\0' || *e == '\n')) {
6404                 start_readonly = num;
6405                 return 0;
6406         }
6407         return -EINVAL;
6408 }
6409
6410 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6411 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6412
6413
6414 EXPORT_SYMBOL(register_md_personality);
6415 EXPORT_SYMBOL(unregister_md_personality);
6416 EXPORT_SYMBOL(md_error);
6417 EXPORT_SYMBOL(md_done_sync);
6418 EXPORT_SYMBOL(md_write_start);
6419 EXPORT_SYMBOL(md_write_end);
6420 EXPORT_SYMBOL(md_register_thread);
6421 EXPORT_SYMBOL(md_unregister_thread);
6422 EXPORT_SYMBOL(md_wakeup_thread);
6423 EXPORT_SYMBOL(md_check_recovery);
6424 MODULE_LICENSE("GPL");
6425 MODULE_ALIAS("md");
6426 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);