1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/hdreg.h>
43 #include <linux/proc_fs.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/file.h>
47 #include <linux/delay.h>
48 #include <linux/raid/md_p.h>
49 #include <linux/raid/md_u.h>
50 #include "md.h"
51 #include "bitmap.h"
52
53 #define DEBUG 0
54 #define dprintk(x...) ((void)(DEBUG && printk(x)))
55
56
57 #ifndef MODULE
58 static void autostart_arrays(int part);
59 #endif
60
61 static LIST_HEAD(pers_list);
62 static DEFINE_SPINLOCK(pers_lock);
63
64 static void md_print_devices(void);
65
66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
67
68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69
70 /*
71  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72  * is 1000 KB/sec, so the extra system load does not show up that much.
73  * Increase it if you want to have more _guaranteed_ speed. Note that
74  * the RAID driver will use the maximum available bandwidth if the IO
75  * subsystem is idle. There is also an 'absolute maximum' reconstruction
76  * speed limit - in case reconstruction slows down your system despite
77  * idle IO detection.
78  *
79  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
80  * or /sys/block/mdX/md/sync_speed_{min,max}
81  */
82
83 static int sysctl_speed_limit_min = 1000;
84 static int sysctl_speed_limit_max = 200000;
85 static inline int speed_min(mddev_t *mddev)
86 {
87         return mddev->sync_speed_min ?
88                 mddev->sync_speed_min : sysctl_speed_limit_min;
89 }
90
91 static inline int speed_max(mddev_t *mddev)
92 {
93         return mddev->sync_speed_max ?
94                 mddev->sync_speed_max : sysctl_speed_limit_max;
95 }
96
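/*
 * Example (illustrative): raising the guaranteed resync floor to
 * 50 MB/sec, system-wide or for one (hypothetical) array md0:
 *
 *	echo 50000 > /proc/sys/dev/raid/speed_limit_min
 *	echo 50000 > /sys/block/md0/md/sync_speed_min
 *
 * A per-array value of 0 falls back to the system-wide default, which
 * is what the speed_min()/speed_max() helpers above implement.
 */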
97 static struct ctl_table_header *raid_table_header;
98
99 static ctl_table raid_table[] = {
100         {
101                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
102                 .procname       = "speed_limit_min",
103                 .data           = &sysctl_speed_limit_min,
104                 .maxlen         = sizeof(int),
105                 .mode           = S_IRUGO|S_IWUSR,
106                 .proc_handler   = &proc_dointvec,
107         },
108         {
109                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
110                 .procname       = "speed_limit_max",
111                 .data           = &sysctl_speed_limit_max,
112                 .maxlen         = sizeof(int),
113                 .mode           = S_IRUGO|S_IWUSR,
114                 .proc_handler   = &proc_dointvec,
115         },
116         { .ctl_name = 0 }
117 };
118
119 static ctl_table raid_dir_table[] = {
120         {
121                 .ctl_name       = DEV_RAID,
122                 .procname       = "raid",
123                 .maxlen         = 0,
124                 .mode           = S_IRUGO|S_IXUGO,
125                 .child          = raid_table,
126         },
127         { .ctl_name = 0 }
128 };
129
130 static ctl_table raid_root_table[] = {
131         {
132                 .ctl_name       = CTL_DEV,
133                 .procname       = "dev",
134                 .maxlen         = 0,
135                 .mode           = 0555,
136                 .child          = raid_dir_table,
137         },
138         { .ctl_name = 0 }
139 };
140
141 static const struct block_device_operations md_fops;
142
143 static int start_readonly;
144
145 /*
146  * We have a system wide 'event count' that is incremented
147  * on any 'interesting' event, and readers of /proc/mdstat
148  * can use 'poll' or 'select' to find out when the event
149  * count increases.
150  *
151  * Events are:
152  *  start array, stop array, error, add device, remove device,
153  *  start build, activate spare
154  */
155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
156 static atomic_t md_event_count;
157 void md_new_event(mddev_t *mddev)
158 {
159         atomic_inc(&md_event_count);
160         wake_up(&md_event_waiters);
161 }
162 EXPORT_SYMBOL_GPL(md_new_event);
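/*
 * Sketch of a user-space watcher (flags abbreviated): pollers of
 * /proc/mdstat are woken whenever md_event_count advances.
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *	poll(&pfd, 1, -1);
 */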
163
164 /* Alternate version that can be called from interrupts
165  * when calling sysfs_notify isn't needed.
166  */
167 static void md_new_event_inintr(mddev_t *mddev)
168 {
169         atomic_inc(&md_event_count);
170         wake_up(&md_event_waiters);
171 }
172
173 /*
174  * Enables iteration over all existing md arrays.
175  * all_mddevs_lock protects this list.
176  */
177 static LIST_HEAD(all_mddevs);
178 static DEFINE_SPINLOCK(all_mddevs_lock);
179
180
181 /*
182  * iterates through all used mddevs in the system.
183  * We take care to grab the all_mddevs_lock whenever navigating
184  * the list, and to always hold a refcount when unlocked.
185  * Any code which breaks out of this loop will own a reference to
186  * the current mddev and must mddev_put() it (usage sketch below).
187  */
188 #define for_each_mddev(mddev,tmp)                                       \
189                                                                         \
190         for (({ spin_lock(&all_mddevs_lock);                            \
191                 tmp = all_mddevs.next;                                  \
192                 mddev = NULL;});                                        \
193              ({ if (tmp != &all_mddevs)                                 \
194                         mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
195                 spin_unlock(&all_mddevs_lock);                          \
196                 if (mddev) mddev_put(mddev);                            \
197                 mddev = list_entry(tmp, mddev_t, all_mddevs);           \
198                 tmp != &all_mddevs;});                                  \
199              ({ spin_lock(&all_mddevs_lock);                            \
200                 tmp = tmp->next;})                                      \
201                 )
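/*
 * Usage sketch for the macro above (want_this_one() is hypothetical):
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		if (want_this_one(mddev)) {
 *			mddev_put(mddev);
 *			break;
 *		}
 *	}
 *
 * Breaking out early keeps the reference the macro took on the current
 * mddev, so it must be dropped by hand; normal termination drops it.
 */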
202
203
204 /* Rather than calling directly into the personality make_request function,
205  * IO requests come here first so that we can check if the device is
206  * being suspended pending a reconfiguration.
207  * We hold a refcount over the call to ->make_request.  By the time that
208  * call has finished, the bio has been linked into some internal structure
209  * and so is visible to ->quiesce(), so we don't need the refcount any more.
210  */
211 static int md_make_request(struct request_queue *q, struct bio *bio)
212 {
213         mddev_t *mddev = q->queuedata;
214         int rv;
215         if (mddev == NULL || mddev->pers == NULL) {
216                 bio_io_error(bio);
217                 return 0;
218         }
219         rcu_read_lock();
220         if (mddev->suspended) {
221                 DEFINE_WAIT(__wait);
222                 for (;;) {
223                         prepare_to_wait(&mddev->sb_wait, &__wait,
224                                         TASK_UNINTERRUPTIBLE);
225                         if (!mddev->suspended)
226                                 break;
227                         rcu_read_unlock();
228                         schedule();
229                         rcu_read_lock();
230                 }
231                 finish_wait(&mddev->sb_wait, &__wait);
232         }
233         atomic_inc(&mddev->active_io);
234         rcu_read_unlock();
235         rv = mddev->pers->make_request(q, bio);
236         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
237                 wake_up(&mddev->sb_wait);
238
239         return rv;
240 }
241
242 static void mddev_suspend(mddev_t *mddev)
243 {
244         BUG_ON(mddev->suspended);
245         mddev->suspended = 1;
246         synchronize_rcu();
247         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
248         mddev->pers->quiesce(mddev, 1);
249         md_unregister_thread(mddev->thread);
250         mddev->thread = NULL;
251         /* we now know that no code is executing in the personality module,
252          * except possibly the tail end of a ->bi_end_io function, but that
253          * is certain to complete before the module has a chance to get
254          * unloaded
255          */
256 }
257
258 static void mddev_resume(mddev_t *mddev)
259 {
260         mddev->suspended = 0;
261         wake_up(&mddev->sb_wait);
262         mddev->pers->quiesce(mddev, 0);
263 }
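/*
 * The pair above is meant to bracket reconfiguration, roughly (sketch):
 *
 *	mddev_suspend(mddev);	blocks new I/O, waits for active_io == 0
 *	... swap or reshape personality state ...
 *	mddev_resume(mddev);	releases waiters in md_make_request()
 */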
264
265 int mddev_congested(mddev_t *mddev, int bits)
266 {
267         return mddev->suspended;
268 }
269 EXPORT_SYMBOL(mddev_congested);
270
271
272 static inline mddev_t *mddev_get(mddev_t *mddev)
273 {
274         atomic_inc(&mddev->active);
275         return mddev;
276 }
277
278 static void mddev_delayed_delete(struct work_struct *ws);
279
280 static void mddev_put(mddev_t *mddev)
281 {
282         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
283                 return;
284         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
285             !mddev->hold_active) {
286                 list_del(&mddev->all_mddevs);
287                 if (mddev->gendisk) {
288                         /* we did a probe so need to clean up.
289                          * Call schedule_work inside the spinlock
290                          * so that flush_scheduled_work() after
291                          * mddev_find will succeed in waiting for the
292                          * work to be done.
293                          */
294                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
295                         schedule_work(&mddev->del_work);
296                 } else
297                         kfree(mddev);
298         }
299         spin_unlock(&all_mddevs_lock);
300 }
301
302 static mddev_t * mddev_find(dev_t unit)
303 {
304         mddev_t *mddev, *new = NULL;
305
306  retry:
307         spin_lock(&all_mddevs_lock);
308
309         if (unit) {
310                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
311                         if (mddev->unit == unit) {
312                                 mddev_get(mddev);
313                                 spin_unlock(&all_mddevs_lock);
314                                 kfree(new);
315                                 return mddev;
316                         }
317
318                 if (new) {
319                         list_add(&new->all_mddevs, &all_mddevs);
320                         spin_unlock(&all_mddevs_lock);
321                         new->hold_active = UNTIL_IOCTL;
322                         return new;
323                 }
324         } else if (new) {
325                 /* find an unused unit number */
326                 static int next_minor = 512;
327                 int start = next_minor;
328                 int is_free = 0;
329                 int dev = 0;
330                 while (!is_free) {
331                         dev = MKDEV(MD_MAJOR, next_minor);
332                         next_minor++;
333                         if (next_minor > MINORMASK)
334                                 next_minor = 0;
335                         if (next_minor == start) {
336                                 /* Oh dear, all in use. */
337                                 spin_unlock(&all_mddevs_lock);
338                                 kfree(new);
339                                 return NULL;
340                         }
341                                 
342                         is_free = 1;
343                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
344                                 if (mddev->unit == dev) {
345                                         is_free = 0;
346                                         break;
347                                 }
348                 }
349                 new->unit = dev;
350                 new->md_minor = MINOR(dev);
351                 new->hold_active = UNTIL_STOP;
352                 list_add(&new->all_mddevs, &all_mddevs);
353                 spin_unlock(&all_mddevs_lock);
354                 return new;
355         }
356         spin_unlock(&all_mddevs_lock);
357
358         new = kzalloc(sizeof(*new), GFP_KERNEL);
359         if (!new)
360                 return NULL;
361
362         new->unit = unit;
363         if (MAJOR(unit) == MD_MAJOR)
364                 new->md_minor = MINOR(unit);
365         else
366                 new->md_minor = MINOR(unit) >> MdpMinorShift;
367
368         mutex_init(&new->open_mutex);
369         mutex_init(&new->reconfig_mutex);
370         INIT_LIST_HEAD(&new->disks);
371         INIT_LIST_HEAD(&new->all_mddevs);
372         init_timer(&new->safemode_timer);
373         atomic_set(&new->active, 1);
374         atomic_set(&new->openers, 0);
375         atomic_set(&new->active_io, 0);
376         spin_lock_init(&new->write_lock);
377         init_waitqueue_head(&new->sb_wait);
378         init_waitqueue_head(&new->recovery_wait);
379         new->reshape_position = MaxSector;
380         new->resync_min = 0;
381         new->resync_max = MaxSector;
382         new->level = LEVEL_NONE;
383
384         goto retry;
385 }
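/*
 * Example (sketch): look up, or allocate on first use, the array for
 * minor 3; the caller owns the returned reference.
 *
 *	mddev_t *mddev = mddev_find(MKDEV(MD_MAJOR, 3));
 *	if (!mddev)
 *		return -ENOMEM;
 *	...
 *	mddev_put(mddev);
 */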
386
387 static inline int mddev_lock(mddev_t * mddev)
388 {
389         return mutex_lock_interruptible(&mddev->reconfig_mutex);
390 }
391
392 static inline int mddev_is_locked(mddev_t *mddev)
393 {
394         return mutex_is_locked(&mddev->reconfig_mutex);
395 }
396
397 static inline int mddev_trylock(mddev_t * mddev)
398 {
399         return mutex_trylock(&mddev->reconfig_mutex);
400 }
401
402 static inline void mddev_unlock(mddev_t * mddev)
403 {
404         mutex_unlock(&mddev->reconfig_mutex);
405
406         md_wakeup_thread(mddev->thread);
407 }
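/*
 * The usual pattern around these helpers (sketch):
 *
 *	if (mddev_lock(mddev))
 *		return -EINTR;	interrupted while waiting for the mutex
 *	... modify the array under reconfig_mutex ...
 *	mddev_unlock(mddev);	also wakes the per-array thread
 */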
408
409 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
410 {
411         mdk_rdev_t *rdev;
412
413         list_for_each_entry(rdev, &mddev->disks, same_set)
414                 if (rdev->desc_nr == nr)
415                         return rdev;
416
417         return NULL;
418 }
419
420 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
421 {
422         mdk_rdev_t *rdev;
423
424         list_for_each_entry(rdev, &mddev->disks, same_set)
425                 if (rdev->bdev->bd_dev == dev)
426                         return rdev;
427
428         return NULL;
429 }
430
431 static struct mdk_personality *find_pers(int level, char *clevel)
432 {
433         struct mdk_personality *pers;
434         list_for_each_entry(pers, &pers_list, list) {
435                 if (level != LEVEL_NONE && pers->level == level)
436                         return pers;
437                 if (strcmp(pers->name, clevel)==0)
438                         return pers;
439         }
440         return NULL;
441 }
442
443 /* return the offset of the superblock in 512-byte sectors */
444 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
445 {
446         sector_t num_sectors = bdev->bd_inode->i_size / 512;
447         return MD_NEW_SIZE_SECTORS(num_sectors);
448 }
449
450 static int alloc_disk_sb(mdk_rdev_t * rdev)
451 {
452         if (rdev->sb_page)
453                 MD_BUG();
454
455         rdev->sb_page = alloc_page(GFP_KERNEL);
456         if (!rdev->sb_page) {
457                 printk(KERN_ALERT "md: out of memory.\n");
458                 return -ENOMEM;
459         }
460
461         return 0;
462 }
463
464 static void free_disk_sb(mdk_rdev_t * rdev)
465 {
466         if (rdev->sb_page) {
467                 put_page(rdev->sb_page);
468                 rdev->sb_loaded = 0;
469                 rdev->sb_page = NULL;
470                 rdev->sb_start = 0;
471                 rdev->sectors = 0;
472         }
473 }
474
475
476 static void super_written(struct bio *bio, int error)
477 {
478         mdk_rdev_t *rdev = bio->bi_private;
479         mddev_t *mddev = rdev->mddev;
480
481         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
482                 printk("md: super_written gets error=%d, uptodate=%d\n",
483                        error, test_bit(BIO_UPTODATE, &bio->bi_flags));
484                 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
485                 md_error(mddev, rdev);
486         }
487
488         if (atomic_dec_and_test(&mddev->pending_writes))
489                 wake_up(&mddev->sb_wait);
490         bio_put(bio);
491 }
492
493 static void super_written_barrier(struct bio *bio, int error)
494 {
495         struct bio *bio2 = bio->bi_private;
496         mdk_rdev_t *rdev = bio2->bi_private;
497         mddev_t *mddev = rdev->mddev;
498
499         if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
500             error == -EOPNOTSUPP) {
501                 unsigned long flags;
502                 /* barriers don't appear to be supported :-( */
503                 set_bit(BarriersNotsupp, &rdev->flags);
504                 mddev->barriers_work = 0;
505                 spin_lock_irqsave(&mddev->write_lock, flags);
506                 bio2->bi_next = mddev->biolist;
507                 mddev->biolist = bio2;
508                 spin_unlock_irqrestore(&mddev->write_lock, flags);
509                 wake_up(&mddev->sb_wait);
510                 bio_put(bio);
511         } else {
512                 bio_put(bio2);
513                 bio->bi_private = rdev;
514                 super_written(bio, error);
515         }
516 }
517
518 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
519                    sector_t sector, int size, struct page *page)
520 {
521         /* write first size bytes of page to sector of rdev
522          * Increment mddev->pending_writes before returning
523          * and decrement it on completion, waking up sb_wait
524          * if zero is reached.
525          * If an error occurred, call md_error
526          *
527          * As we might need to resubmit the request if BIO_RW_BARRIER
528          * causes ENOTSUPP, we allocate a spare bio...
529          */
530         struct bio *bio = bio_alloc(GFP_NOIO, 1);
531         int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
532
533         bio->bi_bdev = rdev->bdev;
534         bio->bi_sector = sector;
535         bio_add_page(bio, page, size, 0);
536         bio->bi_private = rdev;
537         bio->bi_end_io = super_written;
538         bio->bi_rw = rw;
539
540         atomic_inc(&mddev->pending_writes);
541         if (!test_bit(BarriersNotsupp, &rdev->flags)) {
542                 struct bio *rbio;
543                 rw |= (1<<BIO_RW_BARRIER);
544                 rbio = bio_clone(bio, GFP_NOIO);
545                 rbio->bi_private = bio;
546                 rbio->bi_end_io = super_written_barrier;
547                 submit_bio(rw, rbio);
548         } else
549                 submit_bio(rw, bio);
550 }
551
552 void md_super_wait(mddev_t *mddev)
553 {
554         /* wait for all superblock writes that were scheduled to complete.
555          * if any had to be retried (due to BARRIER problems), retry them
556          */
557         DEFINE_WAIT(wq);
558         for(;;) {
559                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
560                 if (atomic_read(&mddev->pending_writes)==0)
561                         break;
562                 while (mddev->biolist) {
563                         struct bio *bio;
564                         spin_lock_irq(&mddev->write_lock);
565                         bio = mddev->biolist;
566                         mddev->biolist = bio->bi_next ;
567                         bio->bi_next = NULL;
568                         spin_unlock_irq(&mddev->write_lock);
569                         submit_bio(bio->bi_rw, bio);
570                 }
571                 schedule();
572         }
573         finish_wait(&mddev->sb_wait, &wq);
574 }
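/*
 * Callers pair md_super_write() with md_super_wait(), e.g. (sketch):
 *
 *	md_super_write(mddev, rdev, rdev->sb_start, rdev->sb_size,
 *		       rdev->sb_page);
 *	md_super_wait(mddev);
 *
 * md_super_wait() also resubmits, without the barrier flag, any writes
 * that were parked on mddev->biolist after failing with -EOPNOTSUPP.
 */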
575
576 static void bi_complete(struct bio *bio, int error)
577 {
578         complete((struct completion*)bio->bi_private);
579 }
580
581 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
582                    struct page *page, int rw)
583 {
584         struct bio *bio = bio_alloc(GFP_NOIO, 1);
585         struct completion event;
586         int ret;
587
588         rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
589
590         bio->bi_bdev = bdev;
591         bio->bi_sector = sector;
592         bio_add_page(bio, page, size, 0);
593         init_completion(&event);
594         bio->bi_private = &event;
595         bio->bi_end_io = bi_complete;
596         submit_bio(rw, bio);
597         wait_for_completion(&event);
598
599         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
600         bio_put(bio);
601         return ret;
602 }
603 EXPORT_SYMBOL_GPL(sync_page_io);
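/*
 * Example (sketch): synchronously read 4K at a known sector;
 * sync_page_io() returns 1 on success and 0 on failure.
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *	if (page && !sync_page_io(rdev->bdev, rdev->sb_start, 4096,
 *				  page, READ))
 *		printk(KERN_WARNING "md: read failed\n");
 */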
604
605 static int read_disk_sb(mdk_rdev_t * rdev, int size)
606 {
607         char b[BDEVNAME_SIZE];
608         if (!rdev->sb_page) {
609                 MD_BUG();
610                 return -EINVAL;
611         }
612         if (rdev->sb_loaded)
613                 return 0;
614
615
616         if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
617                 goto fail;
618         rdev->sb_loaded = 1;
619         return 0;
620
621 fail:
622         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
623                 bdevname(rdev->bdev,b));
624         return -EINVAL;
625 }
626
627 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
628 {
629         return  sb1->set_uuid0 == sb2->set_uuid0 &&
630                 sb1->set_uuid1 == sb2->set_uuid1 &&
631                 sb1->set_uuid2 == sb2->set_uuid2 &&
632                 sb1->set_uuid3 == sb2->set_uuid3;
633 }
634
635 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
636 {
637         int ret;
638         mdp_super_t *tmp1, *tmp2;
639
640         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
641         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
642
643         if (!tmp1 || !tmp2) {
644                 ret = 0;
645                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
646                 goto abort;
647         }
648
649         *tmp1 = *sb1;
650         *tmp2 = *sb2;
651
652         /*
653          * nr_disks is not constant
654          */
655         tmp1->nr_disks = 0;
656         tmp2->nr_disks = 0;
657
658         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
659 abort:
660         kfree(tmp1);
661         kfree(tmp2);
662         return ret;
663 }
664
665
666 static u32 md_csum_fold(u32 csum)
667 {
668         csum = (csum & 0xffff) + (csum >> 16);
669         return (csum & 0xffff) + (csum >> 16);
670 }
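/* Worked example: 0x0001ffff becomes 0x10000 after the first pass and
 * 0x0001 after the second; two passes always suffice because the first
 * sum is at most 0xffff + 0xffff = 0x1fffe.
 */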
671
672 static unsigned int calc_sb_csum(mdp_super_t * sb)
673 {
674         u64 newcsum = 0;
675         u32 *sb32 = (u32*)sb;
676         int i;
677         unsigned int disk_csum, csum;
678
679         disk_csum = sb->sb_csum;
680         sb->sb_csum = 0;
681
682         for (i = 0; i < MD_SB_BYTES/4 ; i++)
683                 newcsum += sb32[i];
684         csum = (newcsum & 0xffffffff) + (newcsum>>32);
685
686
687 #ifdef CONFIG_ALPHA
688         /* This used to use csum_partial, which was wrong for several
689          * reasons including that different results are returned on
690          * different architectures.  It isn't critical that we get exactly
691          * the same return value as before (we always csum_fold before
692          * testing, and that removes any differences).  However as we
693          * know that csum_partial always returned a 16bit value on
694          * alphas, do a fold to maximise conformity to previous behaviour.
695          */
696         sb->sb_csum = md_csum_fold(disk_csum);
697 #else
698         sb->sb_csum = disk_csum;
699 #endif
700         return csum;
701 }
702
703
704 /*
705  * Handle superblock details.
706  * We want to be able to handle multiple superblock formats
707  * so we have a common interface to them all, and an array of
708  * different handlers.
709  * We rely on user-space to write the initial superblock, and support
710  * reading and updating of superblocks.
711  * Interface methods are:
712  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
713  *      loads and validates a superblock on dev.
714  *      if refdev != NULL, compare superblocks on both devices
715  *    Return:
716  *      0 - dev has a superblock that is compatible with refdev
717  *      1 - dev has a superblock that is compatible and newer than refdev
718  *          so dev should be used as the refdev in future
719  *     -EINVAL superblock incompatible or invalid
720  *     -othererror e.g. -EIO
721  *
722  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
723  *      Verify that dev is acceptable into mddev.
724  *       The first time, mddev->raid_disks will be 0, and data from
725  *       dev should be merged in.  Subsequent calls check that dev
726  *       is new enough.  Return 0 or -EINVAL
727  *
728  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
729  *     Update the superblock for rdev with data in mddev
730  *     This does not write to disc.
731  *
732  */
733
734 struct super_type  {
735         char                *name;
736         struct module       *owner;
737         int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
738                                           int minor_version);
739         int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
740         void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
741         unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
742                                                 sector_t num_sectors);
743 };
744
745 /*
746  * Check that the given mddev has no bitmap.
747  *
748  * This function is called from the run method of all personalities that do not
749  * support bitmaps. It prints an error message and returns non-zero if mddev
750  * has a bitmap. Otherwise, it returns 0.
751  *
752  */
753 int md_check_no_bitmap(mddev_t *mddev)
754 {
755         if (!mddev->bitmap_file && !mddev->bitmap_offset)
756                 return 0;
757         printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
758                 mdname(mddev), mddev->pers->name);
759         return 1;
760 }
761 EXPORT_SYMBOL(md_check_no_bitmap);
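/*
 * E.g. a personality without bitmap support would do (sketch):
 *
 *	static int run(mddev_t *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		...
 *	}
 */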
762
763 /*
764  * load_super for 0.90.0 
765  */
766 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
767 {
768         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
769         mdp_super_t *sb;
770         int ret;
771
772         /*
773          * Calculate the position of the superblock (512byte sectors),
774          * it's at the end of the disk.
775          *
776          * It also happens to be a multiple of 4Kb.
777          */
778         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
779
780         ret = read_disk_sb(rdev, MD_SB_BYTES);
781         if (ret) return ret;
782
783         ret = -EINVAL;
784
785         bdevname(rdev->bdev, b);
786         sb = (mdp_super_t*)page_address(rdev->sb_page);
787
788         if (sb->md_magic != MD_SB_MAGIC) {
789                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
790                        b);
791                 goto abort;
792         }
793
794         if (sb->major_version != 0 ||
795             sb->minor_version < 90 ||
796             sb->minor_version > 91) {
797                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
798                         sb->major_version, sb->minor_version,
799                         b);
800                 goto abort;
801         }
802
803         if (sb->raid_disks <= 0)
804                 goto abort;
805
806         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
807                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
808                         b);
809                 goto abort;
810         }
811
812         rdev->preferred_minor = sb->md_minor;
813         rdev->data_offset = 0;
814         rdev->sb_size = MD_SB_BYTES;
815
816         if (sb->level == LEVEL_MULTIPATH)
817                 rdev->desc_nr = -1;
818         else
819                 rdev->desc_nr = sb->this_disk.number;
820
821         if (!refdev) {
822                 ret = 1;
823         } else {
824                 __u64 ev1, ev2;
825                 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
826                 if (!uuid_equal(refsb, sb)) {
827                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
828                                 b, bdevname(refdev->bdev,b2));
829                         goto abort;
830                 }
831                 if (!sb_equal(refsb, sb)) {
832                         printk(KERN_WARNING "md: %s has same UUID"
833                                " but different superblock to %s\n",
834                                b, bdevname(refdev->bdev, b2));
835                         goto abort;
836                 }
837                 ev1 = md_event(sb);
838                 ev2 = md_event(refsb);
839                 if (ev1 > ev2)
840                         ret = 1;
841                 else 
842                         ret = 0;
843         }
844         rdev->sectors = rdev->sb_start;
845
846         if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
847                 /* "this cannot possibly happen" ... */
848                 ret = -EINVAL;
849
850  abort:
851         return ret;
852 }
853
854 /*
855  * validate_super for 0.90.0
856  */
857 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
858 {
859         mdp_disk_t *desc;
860         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
861         __u64 ev1 = md_event(sb);
862
863         rdev->raid_disk = -1;
864         clear_bit(Faulty, &rdev->flags);
865         clear_bit(In_sync, &rdev->flags);
866         clear_bit(WriteMostly, &rdev->flags);
867         clear_bit(BarriersNotsupp, &rdev->flags);
868
869         if (mddev->raid_disks == 0) {
870                 mddev->major_version = 0;
871                 mddev->minor_version = sb->minor_version;
872                 mddev->patch_version = sb->patch_version;
873                 mddev->external = 0;
874                 mddev->chunk_sectors = sb->chunk_size >> 9;
875                 mddev->ctime = sb->ctime;
876                 mddev->utime = sb->utime;
877                 mddev->level = sb->level;
878                 mddev->clevel[0] = 0;
879                 mddev->layout = sb->layout;
880                 mddev->raid_disks = sb->raid_disks;
881                 mddev->dev_sectors = sb->size * 2;
882                 mddev->events = ev1;
883                 mddev->bitmap_offset = 0;
884                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
885
886                 if (mddev->minor_version >= 91) {
887                         mddev->reshape_position = sb->reshape_position;
888                         mddev->delta_disks = sb->delta_disks;
889                         mddev->new_level = sb->new_level;
890                         mddev->new_layout = sb->new_layout;
891                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
892                 } else {
893                         mddev->reshape_position = MaxSector;
894                         mddev->delta_disks = 0;
895                         mddev->new_level = mddev->level;
896                         mddev->new_layout = mddev->layout;
897                         mddev->new_chunk_sectors = mddev->chunk_sectors;
898                 }
899
900                 if (sb->state & (1<<MD_SB_CLEAN))
901                         mddev->recovery_cp = MaxSector;
902                 else {
903                         if (sb->events_hi == sb->cp_events_hi && 
904                                 sb->events_lo == sb->cp_events_lo) {
905                                 mddev->recovery_cp = sb->recovery_cp;
906                         } else
907                                 mddev->recovery_cp = 0;
908                 }
909
910                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
911                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
912                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
913                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
914
915                 mddev->max_disks = MD_SB_DISKS;
916
917                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
918                     mddev->bitmap_file == NULL)
919                         mddev->bitmap_offset = mddev->default_bitmap_offset;
920
921         } else if (mddev->pers == NULL) {
922                 /* Insist on a good event counter while assembling */
923                 ++ev1;
924                 if (ev1 < mddev->events) 
925                         return -EINVAL;
926         } else if (mddev->bitmap) {
927                 /* if adding to array with a bitmap, then we can accept an
928                  * older device ... but not too old.
929                  */
930                 if (ev1 < mddev->bitmap->events_cleared)
931                         return 0;
932         } else {
933                 if (ev1 < mddev->events)
934                         /* just a hot-add of a new device, leave raid_disk at -1 */
935                         return 0;
936         }
937
938         if (mddev->level != LEVEL_MULTIPATH) {
939                 desc = sb->disks + rdev->desc_nr;
940
941                 if (desc->state & (1<<MD_DISK_FAULTY))
942                         set_bit(Faulty, &rdev->flags);
943                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
944                             desc->raid_disk < mddev->raid_disks */) {
945                         set_bit(In_sync, &rdev->flags);
946                         rdev->raid_disk = desc->raid_disk;
947                 }
948                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
949                         set_bit(WriteMostly, &rdev->flags);
950         } else /* MULTIPATH are always insync */
951                 set_bit(In_sync, &rdev->flags);
952         return 0;
953 }
954
955 /*
956  * sync_super for 0.90.0
957  */
958 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
959 {
960         mdp_super_t *sb;
961         mdk_rdev_t *rdev2;
962         int next_spare = mddev->raid_disks;
963
964
965         /* make rdev->sb match mddev data..
966          *
967          * 1/ zero out disks
968          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
969          * 3/ any empty disks < next_spare become removed
970          *
971          * disks[0] gets initialised to REMOVED because
972          * we cannot be sure from other fields if it has
973          * been initialised or not.
974          */
975         int i;
976         int active=0, working=0,failed=0,spare=0,nr_disks=0;
977
978         rdev->sb_size = MD_SB_BYTES;
979
980         sb = (mdp_super_t*)page_address(rdev->sb_page);
981
982         memset(sb, 0, sizeof(*sb));
983
984         sb->md_magic = MD_SB_MAGIC;
985         sb->major_version = mddev->major_version;
986         sb->patch_version = mddev->patch_version;
987         sb->gvalid_words  = 0; /* ignored */
988         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
989         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
990         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
991         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
992
993         sb->ctime = mddev->ctime;
994         sb->level = mddev->level;
995         sb->size = mddev->dev_sectors / 2;
996         sb->raid_disks = mddev->raid_disks;
997         sb->md_minor = mddev->md_minor;
998         sb->not_persistent = 0;
999         sb->utime = mddev->utime;
1000         sb->state = 0;
1001         sb->events_hi = (mddev->events>>32);
1002         sb->events_lo = (u32)mddev->events;
1003
1004         if (mddev->reshape_position == MaxSector)
1005                 sb->minor_version = 90;
1006         else {
1007                 sb->minor_version = 91;
1008                 sb->reshape_position = mddev->reshape_position;
1009                 sb->new_level = mddev->new_level;
1010                 sb->delta_disks = mddev->delta_disks;
1011                 sb->new_layout = mddev->new_layout;
1012                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1013         }
1014         mddev->minor_version = sb->minor_version;
1015         if (mddev->in_sync) {
1017                 sb->recovery_cp = mddev->recovery_cp;
1018                 sb->cp_events_hi = (mddev->events>>32);
1019                 sb->cp_events_lo = (u32)mddev->events;
1020                 if (mddev->recovery_cp == MaxSector)
1021                         sb->state = (1<< MD_SB_CLEAN);
1022         } else
1023                 sb->recovery_cp = 0;
1024
1025         sb->layout = mddev->layout;
1026         sb->chunk_size = mddev->chunk_sectors << 9;
1027
1028         if (mddev->bitmap && mddev->bitmap_file == NULL)
1029                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1030
1031         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1032         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1033                 mdp_disk_t *d;
1034                 int desc_nr;
1035                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1036                     && !test_bit(Faulty, &rdev2->flags))
1037                         desc_nr = rdev2->raid_disk;
1038                 else
1039                         desc_nr = next_spare++;
1040                 rdev2->desc_nr = desc_nr;
1041                 d = &sb->disks[rdev2->desc_nr];
1042                 nr_disks++;
1043                 d->number = rdev2->desc_nr;
1044                 d->major = MAJOR(rdev2->bdev->bd_dev);
1045                 d->minor = MINOR(rdev2->bdev->bd_dev);
1046                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1047                     && !test_bit(Faulty, &rdev2->flags))
1048                         d->raid_disk = rdev2->raid_disk;
1049                 else
1050                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1051                 if (test_bit(Faulty, &rdev2->flags))
1052                         d->state = (1<<MD_DISK_FAULTY);
1053                 else if (test_bit(In_sync, &rdev2->flags)) {
1054                         d->state = (1<<MD_DISK_ACTIVE);
1055                         d->state |= (1<<MD_DISK_SYNC);
1056                         active++;
1057                         working++;
1058                 } else {
1059                         d->state = 0;
1060                         spare++;
1061                         working++;
1062                 }
1063                 if (test_bit(WriteMostly, &rdev2->flags))
1064                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1065         }
1066         /* now set the "removed" and "faulty" bits on any missing devices */
1067         for (i=0 ; i < mddev->raid_disks ; i++) {
1068                 mdp_disk_t *d = &sb->disks[i];
1069                 if (d->state == 0 && d->number == 0) {
1070                         d->number = i;
1071                         d->raid_disk = i;
1072                         d->state = (1<<MD_DISK_REMOVED);
1073                         d->state |= (1<<MD_DISK_FAULTY);
1074                         failed++;
1075                 }
1076         }
1077         sb->nr_disks = nr_disks;
1078         sb->active_disks = active;
1079         sb->working_disks = working;
1080         sb->failed_disks = failed;
1081         sb->spare_disks = spare;
1082
1083         sb->this_disk = sb->disks[rdev->desc_nr];
1084         sb->sb_csum = calc_sb_csum(sb);
1085 }
1086
1087 /*
1088  * rdev_size_change for 0.90.0
1089  */
1090 static unsigned long long
1091 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1092 {
1093         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1094                 return 0; /* component must fit device */
1095         if (rdev->mddev->bitmap_offset)
1096                 return 0; /* can't move bitmap */
1097         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1098         if (!num_sectors || num_sectors > rdev->sb_start)
1099                 num_sectors = rdev->sb_start;
1100         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1101                        rdev->sb_page);
1102         md_super_wait(rdev->mddev);
1103         return num_sectors / 2; /* kB for sysfs */
1104 }
1105
1106
1107 /*
1108  * version 1 superblock
1109  */
1110
1111 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1112 {
1113         __le32 disk_csum;
1114         u32 csum;
1115         unsigned long long newcsum;
1116         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1117         __le32 *isuper = (__le32*)sb;
1118         int i;
1119
1120         disk_csum = sb->sb_csum;
1121         sb->sb_csum = 0;
1122         newcsum = 0;
1123         for (i=0; size>=4; size -= 4 )
1124                 newcsum += le32_to_cpu(*isuper++);
1125
1126         if (size == 2)
1127                 newcsum += le16_to_cpu(*(__le16*) isuper);
1128
1129         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1130         sb->sb_csum = disk_csum;
1131         return cpu_to_le32(csum);
1132 }
1133
1134 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1135 {
1136         struct mdp_superblock_1 *sb;
1137         int ret;
1138         sector_t sb_start;
1139         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1140         int bmask;
1141
1142         /*
1143          * Calculate the position of the superblock in 512-byte sectors.
1144          * It is always aligned to a 4K boundary and
1145          * depending on minor_version, it can be:
1146          * 0: At least 8K, but less than 12K, from end of device
1147          * 1: At start of device
1148          * 2: 4K from start of device.
1149          */
1150         switch(minor_version) {
1151         case 0:
1152                 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1153                 sb_start -= 8*2;
1154                 sb_start &= ~(sector_t)(4*2-1);
1155                 break;
1156         case 1:
1157                 sb_start = 0;
1158                 break;
1159         case 2:
1160                 sb_start = 8;
1161                 break;
1162         default:
1163                 return -EINVAL;
1164         }
1165         rdev->sb_start = sb_start;
1166
1167         /* The superblock is rarely larger than 1K, but it can be,
1168          * and it is safe to read 4K, so we do that.
1169          */
1170         ret = read_disk_sb(rdev, 4096);
1171         if (ret) return ret;
1172
1173
1174         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1175
1176         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1177             sb->major_version != cpu_to_le32(1) ||
1178             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1179             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1180             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1181                 return -EINVAL;
1182
1183         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1184                 printk("md: invalid superblock checksum on %s\n",
1185                         bdevname(rdev->bdev,b));
1186                 return -EINVAL;
1187         }
1188         if (le64_to_cpu(sb->data_size) < 10) {
1189                 printk("md: data_size too small on %s\n",
1190                        bdevname(rdev->bdev,b));
1191                 return -EINVAL;
1192         }
1193
1194         rdev->preferred_minor = 0xffff;
1195         rdev->data_offset = le64_to_cpu(sb->data_offset);
1196         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1197
1198         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1199         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1200         if (rdev->sb_size & bmask)
1201                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1202
1203         if (minor_version
1204             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1205                 return -EINVAL;
1206
1207         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1208                 rdev->desc_nr = -1;
1209         else
1210                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1211
1212         if (!refdev) {
1213                 ret = 1;
1214         } else {
1215                 __u64 ev1, ev2;
1216                 struct mdp_superblock_1 *refsb = 
1217                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1218
1219                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1220                     sb->level != refsb->level ||
1221                     sb->layout != refsb->layout ||
1222                     sb->chunksize != refsb->chunksize) {
1223                         printk(KERN_WARNING "md: %s has strangely different"
1224                                 " superblock to %s\n",
1225                                 bdevname(rdev->bdev,b),
1226                                 bdevname(refdev->bdev,b2));
1227                         return -EINVAL;
1228                 }
1229                 ev1 = le64_to_cpu(sb->events);
1230                 ev2 = le64_to_cpu(refsb->events);
1231
1232                 if (ev1 > ev2)
1233                         ret = 1;
1234                 else
1235                         ret = 0;
1236         }
1237         if (minor_version)
1238                 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1239                         le64_to_cpu(sb->data_offset);
1240         else
1241                 rdev->sectors = rdev->sb_start;
1242         if (rdev->sectors < le64_to_cpu(sb->data_size))
1243                 return -EINVAL;
1244         rdev->sectors = le64_to_cpu(sb->data_size);
1245         if (le64_to_cpu(sb->size) > rdev->sectors)
1246                 return -EINVAL;
1247         return ret;
1248 }
1249
1250 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1251 {
1252         struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1253         __u64 ev1 = le64_to_cpu(sb->events);
1254
1255         rdev->raid_disk = -1;
1256         clear_bit(Faulty, &rdev->flags);
1257         clear_bit(In_sync, &rdev->flags);
1258         clear_bit(WriteMostly, &rdev->flags);
1259         clear_bit(BarriersNotsupp, &rdev->flags);
1260
1261         if (mddev->raid_disks == 0) {
1262                 mddev->major_version = 1;
1263                 mddev->patch_version = 0;
1264                 mddev->external = 0;
1265                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1266                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1267                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1268                 mddev->level = le32_to_cpu(sb->level);
1269                 mddev->clevel[0] = 0;
1270                 mddev->layout = le32_to_cpu(sb->layout);
1271                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1272                 mddev->dev_sectors = le64_to_cpu(sb->size);
1273                 mddev->events = ev1;
1274                 mddev->bitmap_offset = 0;
1275                 mddev->default_bitmap_offset = 1024 >> 9;
1276                 
1277                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1278                 memcpy(mddev->uuid, sb->set_uuid, 16);
1279
1280                 mddev->max_disks =  (4096-256)/2;
1281
1282                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1283                     mddev->bitmap_file == NULL )
1284                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1285
1286                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1287                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1288                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1289                         mddev->new_level = le32_to_cpu(sb->new_level);
1290                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1291                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1292                 } else {
1293                         mddev->reshape_position = MaxSector;
1294                         mddev->delta_disks = 0;
1295                         mddev->new_level = mddev->level;
1296                         mddev->new_layout = mddev->layout;
1297                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1298                 }
1299
1300         } else if (mddev->pers == NULL) {
1301                 /* Insist on a good event counter while assembling */
1302                 ++ev1;
1303                 if (ev1 < mddev->events)
1304                         return -EINVAL;
1305         } else if (mddev->bitmap) {
1306                 /* If adding to array with a bitmap, then we can accept an
1307                  * older device, but not too old.
1308                  */
1309                 if (ev1 < mddev->bitmap->events_cleared)
1310                         return 0;
1311         } else {
1312                 if (ev1 < mddev->events)
1313                         /* just a hot-add of a new device, leave raid_disk at -1 */
1314                         return 0;
1315         }
1316         if (mddev->level != LEVEL_MULTIPATH) {
1317                 int role;
1318                 if (rdev->desc_nr < 0 ||
1319                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1320                         role = 0xffff;
1321                         rdev->desc_nr = -1;
1322                 } else
1323                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1324                 switch(role) {
1325                 case 0xffff: /* spare */
1326                         break;
1327                 case 0xfffe: /* faulty */
1328                         set_bit(Faulty, &rdev->flags);
1329                         break;
1330                 default:
1331                         if ((le32_to_cpu(sb->feature_map) &
1332                              MD_FEATURE_RECOVERY_OFFSET))
1333                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1334                         else
1335                                 set_bit(In_sync, &rdev->flags);
1336                         rdev->raid_disk = role;
1337                         break;
1338                 }
1339                 if (sb->devflags & WriteMostly1)
1340                         set_bit(WriteMostly, &rdev->flags);
1341         } else /* MULTIPATH are always insync */
1342                 set_bit(In_sync, &rdev->flags);
1343
1344         return 0;
1345 }
1346
1347 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1348 {
1349         struct mdp_superblock_1 *sb;
1350         mdk_rdev_t *rdev2;
1351         int max_dev, i;
1352         /* make rdev->sb match mddev and rdev data. */
1353
1354         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1355
1356         sb->feature_map = 0;
1357         sb->pad0 = 0;
1358         sb->recovery_offset = cpu_to_le64(0);
1359         memset(sb->pad1, 0, sizeof(sb->pad1));
1360         memset(sb->pad2, 0, sizeof(sb->pad2));
1361         memset(sb->pad3, 0, sizeof(sb->pad3));
1362
1363         sb->utime = cpu_to_le64((__u64)mddev->utime);
1364         sb->events = cpu_to_le64(mddev->events);
1365         if (mddev->in_sync)
1366                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1367         else
1368                 sb->resync_offset = cpu_to_le64(0);
1369
1370         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1371
1372         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1373         sb->size = cpu_to_le64(mddev->dev_sectors);
1374         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1375         sb->level = cpu_to_le32(mddev->level);
1376         sb->layout = cpu_to_le32(mddev->layout);
1377
1378         if (mddev->bitmap && mddev->bitmap_file == NULL) {
1379                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1380                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1381         }
1382
1383         if (rdev->raid_disk >= 0 &&
1384             !test_bit(In_sync, &rdev->flags)) {
1385                 if (mddev->curr_resync_completed > rdev->recovery_offset)
1386                         rdev->recovery_offset = mddev->curr_resync_completed;
1387                 if (rdev->recovery_offset > 0) {
1388                         sb->feature_map |=
1389                                 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1390                         sb->recovery_offset =
1391                                 cpu_to_le64(rdev->recovery_offset);
1392                 }
1393         }
1394
1395         if (mddev->reshape_position != MaxSector) {
1396                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1397                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1398                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1399                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1400                 sb->new_level = cpu_to_le32(mddev->new_level);
1401                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1402         }
1403
1404         max_dev = 0;
1405         list_for_each_entry(rdev2, &mddev->disks, same_set)
1406                 if (rdev2->desc_nr+1 > max_dev)
1407                         max_dev = rdev2->desc_nr+1;
1408
1409         if (max_dev > le32_to_cpu(sb->max_dev)) {
1410                 int bmask;
1411                 sb->max_dev = cpu_to_le32(max_dev);
1412                 rdev->sb_size = max_dev * 2 + 256;
1413                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1414                 if (rdev->sb_size & bmask)
1415                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1416         }
1417         for (i=0; i<max_dev;i++)
1418                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1419         
1420         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1421                 i = rdev2->desc_nr;
1422                 if (test_bit(Faulty, &rdev2->flags))
1423                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1424                 else if (test_bit(In_sync, &rdev2->flags))
1425                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1426                 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1427                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1428                 else
1429                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1430         }
1431
1432         sb->sb_csum = calc_sb_1_csum(sb);
1433 }
1434
1435 static unsigned long long
1436 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1437 {
1438         struct mdp_superblock_1 *sb;
1439         sector_t max_sectors;
1440         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1441                 return 0; /* component must fit device */
1442         if (rdev->sb_start < rdev->data_offset) {
1443                 /* minor versions 1 and 2; superblock before data */
1444                 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1445                 max_sectors -= rdev->data_offset;
1446                 if (!num_sectors || num_sectors > max_sectors)
1447                         num_sectors = max_sectors;
1448         } else if (rdev->mddev->bitmap_offset) {
1449                 /* minor version 0 with bitmap we can't move */
1450                 return 0;
1451         } else {
1452                 /* minor version 0; superblock after data */
1453                 sector_t sb_start;
1454                 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; /* 8K from the end */
1455                 sb_start &= ~(sector_t)(4*2 - 1); /* aligned down to a 4K boundary */
1456                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1457                 if (!num_sectors || num_sectors > max_sectors)
1458                         num_sectors = max_sectors;
1459                 rdev->sb_start = sb_start;
1460         }
1461         sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1462         sb->data_size = cpu_to_le64(num_sectors);
1463         sb->super_offset = rdev->sb_start;
1464         sb->sb_csum = calc_sb_1_csum(sb);
1465         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1466                        rdev->sb_page);
1467         md_super_wait(rdev->mddev);
1468         return num_sectors / 2; /* kB for sysfs */
1469 }
1470
1471 static struct super_type super_types[] = {
1472         [0] = {
1473                 .name   = "0.90.0",
1474                 .owner  = THIS_MODULE,
1475                 .load_super         = super_90_load,
1476                 .validate_super     = super_90_validate,
1477                 .sync_super         = super_90_sync,
1478                 .rdev_size_change   = super_90_rdev_size_change,
1479         },
1480         [1] = {
1481                 .name   = "md-1",
1482                 .owner  = THIS_MODULE,
1483                 .load_super         = super_1_load,
1484                 .validate_super     = super_1_validate,
1485                 .sync_super         = super_1_sync,
1486                 .rdev_size_change   = super_1_rdev_size_change,
1487         },
1488 };
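/* Two on-disk metadata formats are supported: legacy v0.90 (superblock at
 * the end of the device) and v1.x, where the minor version only selects
 * the superblock placement (1.0 at the end of the device, 1.1 at the
 * start, 1.2 4K from the start).
 */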
1489
1490 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1491 {
1492         mdk_rdev_t *rdev, *rdev2;
1493
1494         rcu_read_lock();
1495         rdev_for_each_rcu(rdev, mddev1)
1496                 rdev_for_each_rcu(rdev2, mddev2)
1497                         if (rdev->bdev->bd_contains ==
1498                             rdev2->bdev->bd_contains) {
1499                                 rcu_read_unlock();
1500                                 return 1;
1501                         }
1502         rcu_read_unlock();
1503         return 0;
1504 }
1505
1506 static LIST_HEAD(pending_raid_disks);
1507
1508 /*
1509  * Try to register data integrity profile for an mddev
1510  *
1511  * This is called when an array is started and after a disk has been kicked
1512  * from the array. It only succeeds if all working and active component devices
1513  * are integrity capable with matching profiles.
1514  */
1515 int md_integrity_register(mddev_t *mddev)
1516 {
1517         mdk_rdev_t *rdev, *reference = NULL;
1518
1519         if (list_empty(&mddev->disks))
1520                 return 0; /* nothing to do */
1521         if (blk_get_integrity(mddev->gendisk))
1522                 return 0; /* already registered */
1523         list_for_each_entry(rdev, &mddev->disks, same_set) {
1524                 /* skip spares and non-functional disks */
1525                 if (test_bit(Faulty, &rdev->flags))
1526                         continue;
1527                 if (rdev->raid_disk < 0)
1528                         continue;
1529                 /*
1530                  * If at least one rdev is not integrity capable, we can not
1531                  * enable data integrity for the md device.
1532                  */
1533                 if (!bdev_get_integrity(rdev->bdev))
1534                         return -EINVAL;
1535                 if (!reference) {
1536                         /* Use the first rdev as the reference */
1537                         reference = rdev;
1538                         continue;
1539                 }
1540                 /* does this rdev's profile match the reference profile? */
1541                 if (blk_integrity_compare(reference->bdev->bd_disk,
1542                                 rdev->bdev->bd_disk) < 0)
1543                         return -EINVAL;
1544         }
1545         /*
1546          * All component devices are integrity capable and have matching
1547          * profiles, register the common profile for the md device.
1548          */
1549         if (blk_integrity_register(mddev->gendisk,
1550                         bdev_get_integrity(reference->bdev)) != 0) {
1551                 printk(KERN_ERR "md: failed to register integrity for %s\n",
1552                         mdname(mddev));
1553                 return -EINVAL;
1554         }
1555         printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1556                 mdname(mddev));
1557         return 0;
1558 }
1559 EXPORT_SYMBOL(md_integrity_register);
1560
1561 /* Disable data integrity if non-capable/non-matching disk is being added */
1562 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1563 {
1564         struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1565         struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1566
1567         if (!bi_mddev) /* nothing to do */
1568                 return;
1569         if (rdev->raid_disk < 0) /* skip spares */
1570                 return;
1571         if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1572                                              rdev->bdev->bd_disk) >= 0)
1573                 return;
1574         printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1575         blk_integrity_unregister(mddev->gendisk);
1576 }
1577 EXPORT_SYMBOL(md_integrity_add_rdev);
1578
1579 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1580 {
1581         char b[BDEVNAME_SIZE];
1582         struct kobject *ko;
1583         char *s;
1584         int err;
1585
1586         if (rdev->mddev) {
1587                 MD_BUG();
1588                 return -EINVAL;
1589         }
1590
1591         /* prevent duplicates */
1592         if (find_rdev(mddev, rdev->bdev->bd_dev))
1593                 return -EEXIST;
1594
1595         /* make sure rdev->sectors is at least mddev->dev_sectors */
1596         if (rdev->sectors && (mddev->dev_sectors == 0 ||
1597                         rdev->sectors < mddev->dev_sectors)) {
1598                 if (mddev->pers) {
1599                         /* Cannot change size, so fail.
1600                          * If mddev->level <= 0, we don't care
1601                          * about aligning sizes (e.g. linear)
1602                          */
1603                         if (mddev->level > 0)
1604                                 return -ENOSPC;
1605                 } else
1606                         mddev->dev_sectors = rdev->sectors;
1607         }
1608
1609         /* Verify rdev->desc_nr is unique.
1610          * If it is -1, assign a free number, else
1611          * check number is not in use
1612          */
1613         if (rdev->desc_nr < 0) {
1614                 int choice = 0;
1615                 if (mddev->pers) choice = mddev->raid_disks;
1616                 while (find_rdev_nr(mddev, choice))
1617                         choice++;
1618                 rdev->desc_nr = choice;
1619         } else {
1620                 if (find_rdev_nr(mddev, rdev->desc_nr))
1621                         return -EBUSY;
1622         }
1623         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1624                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1625                        mdname(mddev), mddev->max_disks);
1626                 return -EBUSY;
1627         }
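        /* sysfs object names must not contain '/', so e.g. device
         * "cciss/c0d0p1" is registered below as "dev-cciss!c0d0p1".
         */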
1628         bdevname(rdev->bdev,b);
1629         while ( (s=strchr(b, '/')) != NULL)
1630                 *s = '!';
1631
1632         rdev->mddev = mddev;
1633         printk(KERN_INFO "md: bind<%s>\n", b);
1634
1635         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1636                 goto fail;
1637
1638         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1639         if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1640                 kobject_del(&rdev->kobj);
1641                 goto fail;
1642         }
1643         rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1644
1645         list_add_rcu(&rdev->same_set, &mddev->disks);
1646         bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1647
1648         /* May as well allow recovery to be retried once */
1649         mddev->recovery_disabled = 0;
1650
1651         return 0;
1652
1653  fail:
1654         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1655                b, mdname(mddev));
1656         return err;
1657 }
1658
1659 static void md_delayed_delete(struct work_struct *ws)
1660 {
1661         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1662         kobject_del(&rdev->kobj);
1663         kobject_put(&rdev->kobj);
1664 }
1665
1666 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1667 {
1668         char b[BDEVNAME_SIZE];
1669         if (!rdev->mddev) {
1670                 MD_BUG();
1671                 return;
1672         }
1673         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1674         list_del_rcu(&rdev->same_set);
1675         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1676         rdev->mddev = NULL;
1677         sysfs_remove_link(&rdev->kobj, "block");
1678         sysfs_put(rdev->sysfs_state);
1679         rdev->sysfs_state = NULL;
1680         /* We need to delay this; otherwise we can deadlock when
1681          * 'remove' is written to "dev/state".  We also need
1682          * to delay it due to rcu usage.
1683          */
1684         synchronize_rcu();
1685         INIT_WORK(&rdev->del_work, md_delayed_delete);
1686         kobject_get(&rdev->kobj);
1687         schedule_work(&rdev->del_work);
1688 }
1689
1690 /*
1691  * prevent the device from being mounted, repartitioned or
1692  * otherwise reused by a RAID array (or any other kernel
1693  * subsystem), by bd_claiming the device.
1694  */
1695 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1696 {
1697         int err = 0;
1698         struct block_device *bdev;
1699         char b[BDEVNAME_SIZE];
1700
1701         bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1702         if (IS_ERR(bdev)) {
1703                 printk(KERN_ERR "md: could not open %s.\n",
1704                         __bdevname(dev, b));
1705                 return PTR_ERR(bdev);
1706         }
1707         err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1708         if (err) {
1709                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1710                         bdevname(bdev, b));
1711                 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1712                 return err;
1713         }
1714         if (!shared)
1715                 set_bit(AllReserved, &rdev->flags);
1716         rdev->bdev = bdev;
1717         return err;
1718 }
1719
1720 static void unlock_rdev(mdk_rdev_t *rdev)
1721 {
1722         struct block_device *bdev = rdev->bdev;
1723         rdev->bdev = NULL;
1724         if (!bdev)
1725                 MD_BUG();
1726         bd_release(bdev);
1727         blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1728 }
1729
1730 void md_autodetect_dev(dev_t dev);
1731
1732 static void export_rdev(mdk_rdev_t * rdev)
1733 {
1734         char b[BDEVNAME_SIZE];
1735         printk(KERN_INFO "md: export_rdev(%s)\n",
1736                 bdevname(rdev->bdev,b));
1737         if (rdev->mddev)
1738                 MD_BUG();
1739         free_disk_sb(rdev);
1740 #ifndef MODULE
1741         if (test_bit(AutoDetected, &rdev->flags))
1742                 md_autodetect_dev(rdev->bdev->bd_dev);
1743 #endif
1744         unlock_rdev(rdev);
1745         kobject_put(&rdev->kobj);
1746 }
1747
1748 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1749 {
1750         unbind_rdev_from_array(rdev);
1751         export_rdev(rdev);
1752 }
1753
1754 static void export_array(mddev_t *mddev)
1755 {
1756         mdk_rdev_t *rdev, *tmp;
1757
1758         rdev_for_each(rdev, tmp, mddev) {
1759                 if (!rdev->mddev) {
1760                         MD_BUG();
1761                         continue;
1762                 }
1763                 kick_rdev_from_array(rdev);
1764         }
1765         if (!list_empty(&mddev->disks))
1766                 MD_BUG();
1767         mddev->raid_disks = 0;
1768         mddev->major_version = 0;
1769 }
1770
1771 static void print_desc(mdp_disk_t *desc)
1772 {
1773         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1774                 desc->major,desc->minor,desc->raid_disk,desc->state);
1775 }
1776
1777 static void print_sb_90(mdp_super_t *sb)
1778 {
1779         int i;
1780
1781         printk(KERN_INFO 
1782                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1783                 sb->major_version, sb->minor_version, sb->patch_version,
1784                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1785                 sb->ctime);
1786         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1787                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1788                 sb->md_minor, sb->layout, sb->chunk_size);
1789         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1790                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1791                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1792                 sb->failed_disks, sb->spare_disks,
1793                 sb->sb_csum, (unsigned long)sb->events_lo);
1794
1795         printk(KERN_INFO);
1796         for (i = 0; i < MD_SB_DISKS; i++) {
1797                 mdp_disk_t *desc;
1798
1799                 desc = sb->disks + i;
1800                 if (desc->number || desc->major || desc->minor ||
1801                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1802                         printk("     D %2d: ", i);
1803                         print_desc(desc);
1804                 }
1805         }
1806         printk(KERN_INFO "md:     THIS: ");
1807         print_desc(&sb->this_disk);
1808 }
1809
1810 static void print_sb_1(struct mdp_superblock_1 *sb)
1811 {
1812         __u8 *uuid;
1813
1814         uuid = sb->set_uuid;
1815         printk(KERN_INFO
1816                "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1817                ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1818                "md:    Name: \"%s\" CT:%llu\n",
1819                 le32_to_cpu(sb->major_version),
1820                 le32_to_cpu(sb->feature_map),
1821                 uuid[0], uuid[1], uuid[2], uuid[3],
1822                 uuid[4], uuid[5], uuid[6], uuid[7],
1823                 uuid[8], uuid[9], uuid[10], uuid[11],
1824                 uuid[12], uuid[13], uuid[14], uuid[15],
1825                 sb->set_name,
1826                 (unsigned long long)le64_to_cpu(sb->ctime)
1827                        & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1828
1829         uuid = sb->device_uuid;
1830         printk(KERN_INFO
1831                "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1832                         " RO:%llu\n"
1833                "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1834                         ":%02x%02x%02x%02x%02x%02x\n"
1835                "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1836                "md:         (MaxDev:%u) \n",
1837                 le32_to_cpu(sb->level),
1838                 (unsigned long long)le64_to_cpu(sb->size),
1839                 le32_to_cpu(sb->raid_disks),
1840                 le32_to_cpu(sb->layout),
1841                 le32_to_cpu(sb->chunksize),
1842                 (unsigned long long)le64_to_cpu(sb->data_offset),
1843                 (unsigned long long)le64_to_cpu(sb->data_size),
1844                 (unsigned long long)le64_to_cpu(sb->super_offset),
1845                 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1846                 le32_to_cpu(sb->dev_number),
1847                 uuid[0], uuid[1], uuid[2], uuid[3],
1848                 uuid[4], uuid[5], uuid[6], uuid[7],
1849                 uuid[8], uuid[9], uuid[10], uuid[11],
1850                 uuid[12], uuid[13], uuid[14], uuid[15],
1851                 sb->devflags,
1852                 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1853                 (unsigned long long)le64_to_cpu(sb->events),
1854                 (unsigned long long)le64_to_cpu(sb->resync_offset),
1855                 le32_to_cpu(sb->sb_csum),
1856                 le32_to_cpu(sb->max_dev)
1857                 );
1858 }
1859
1860 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1861 {
1862         char b[BDEVNAME_SIZE];
1863         printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1864                 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1865                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1866                 rdev->desc_nr);
1867         if (rdev->sb_loaded) {
1868                 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1869                 switch (major_version) {
1870                 case 0:
1871                         print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1872                         break;
1873                 case 1:
1874                         print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1875                         break;
1876                 }
1877         } else
1878                 printk(KERN_INFO "md: no rdev superblock!\n");
1879 }
1880
1881 static void md_print_devices(void)
1882 {
1883         struct list_head *tmp;
1884         mdk_rdev_t *rdev;
1885         mddev_t *mddev;
1886         char b[BDEVNAME_SIZE];
1887
1888         printk("\n");
1889         printk("md:     **********************************\n");
1890         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1891         printk("md:     **********************************\n");
1892         for_each_mddev(mddev, tmp) {
1893
1894                 if (mddev->bitmap)
1895                         bitmap_print_sb(mddev->bitmap);
1896                 else
1897                         printk("%s: ", mdname(mddev));
1898                 list_for_each_entry(rdev, &mddev->disks, same_set)
1899                         printk("<%s>", bdevname(rdev->bdev,b));
1900                 printk("\n");
1901
1902                 list_for_each_entry(rdev, &mddev->disks, same_set)
1903                         print_rdev(rdev, mddev->major_version);
1904         }
1905         printk("md:     **********************************\n");
1906         printk("\n");
1907 }
1908
1909
1910 static void sync_sbs(mddev_t * mddev, int nospares)
1911 {
1912         /* Update each superblock (in-memory image), but
1913          * if we are allowed to, skip spares which already
1914          * have the right event counter, or have one earlier
1915          * (which would mean they aren't being marked as dirty
1916          * with the rest of the array)
1917          */
1918         mdk_rdev_t *rdev;
1919
1920         list_for_each_entry(rdev, &mddev->disks, same_set) {
1921                 if (rdev->sb_events == mddev->events ||
1922                     (nospares &&
1923                      rdev->raid_disk < 0 &&
1924                      (rdev->sb_events&1)==0 &&
1925                      rdev->sb_events+1 == mddev->events)) {
1926                         /* Don't update this superblock */
1927                         rdev->sb_loaded = 2;
1928                 } else {
1929                         super_types[mddev->major_version].
1930                                 sync_super(mddev, rdev);
1931                         rdev->sb_loaded = 1;
1932                 }
1933         }
1934 }
1935
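/* Event-count parity note (cf. sync_sbs above): a clean <-> dirty
 * transition normally increments mddev->events, but when the array is
 * clean and 'events' is odd we may instead roll the counter back by one,
 * so that spare superblocks already carrying the previous (even) count
 * do not need to be rewritten at all.
 */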
1936 static void md_update_sb(mddev_t * mddev, int force_change)
1937 {
1938         mdk_rdev_t *rdev;
1939         int sync_req;
1940         int nospares = 0;
1941
1942         mddev->utime = get_seconds();
1943         if (mddev->external)
1944                 return;
1945 repeat:
1946         spin_lock_irq(&mddev->write_lock);
1947
1948         set_bit(MD_CHANGE_PENDING, &mddev->flags);
1949         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1950                 force_change = 1;
1951         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1952                 /* just a clean <-> dirty transition; we can possibly leave
1953                  * spares alone, though if the event count doesn't have the
1954                  * right even/odd parity, we will have to update spares after all
1955                  */
1956                 nospares = 1;
1957         if (force_change)
1958                 nospares = 0;
1959         if (mddev->degraded)
1960                 /* If the array is degraded, then skipping spares is both
1961                  * dangerous and fairly pointless.
1962                  * Dangerous because a device that was removed from the array
1963                  * might have an event count that still looks up-to-date,
1964                  * so it could be re-added without a resync.
1965                  * Pointless because if there are any spares to skip,
1966                  * then a recovery will happen, the array will soon no longer
1967                  * be degraded, and the spares can go back to sleep.
1968                  */
1969                 nospares = 0;
1970
1971         sync_req = mddev->in_sync;
1972
1973         /* If this is just a dirty<->clean transition, and the array is clean
1974          * and 'events' is odd, we can roll back to the previous clean state */
1975         if (nospares
1976             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1977             && (mddev->events & 1)
1978             && mddev->events != 1)
1979                 mddev->events--;
1980         else {
1981                 /* otherwise we have to go forward and ... */
1982                 mddev->events ++;
1983                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1984                         /* .. if the array isn't clean, an 'even' event must also go
1985                          * to spares. */
1986                         if ((mddev->events&1)==0)
1987                                 nospares = 0;
1988                 } else {
1989                         /* otherwise an 'odd' event must go to spares */
1990                         if ((mddev->events&1))
1991                                 nospares = 0;
1992                 }
1993         }
1994
1995         if (!mddev->events) {
1996                 /*
1997                  * oops, this 64-bit counter should never wrap.
1998                  * Either we are somewhere around the year 1 trillion A.D.,
1999                  * assuming 1 reboot per second, or we have a bug:
2000                  */
2001                 MD_BUG();
2002                 mddev->events --;
2003         }
2004
2005         /*
2006          * do not write anything to disk if using
2007          * nonpersistent superblocks
2008          */
2009         if (!mddev->persistent) {
2010                 if (!mddev->external)
2011                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2012
2013                 spin_unlock_irq(&mddev->write_lock);
2014                 wake_up(&mddev->sb_wait);
2015                 return;
2016         }
2017         sync_sbs(mddev, nospares);
2018         spin_unlock_irq(&mddev->write_lock);
2019
2020         dprintk(KERN_INFO 
2021                 "md: updating %s RAID superblock on device (in sync %d)\n",
2022                 mdname(mddev),mddev->in_sync);
2023
2024         bitmap_update_sb(mddev->bitmap);
2025         list_for_each_entry(rdev, &mddev->disks, same_set) {
2026                 char b[BDEVNAME_SIZE];
2027                 dprintk(KERN_INFO "md: ");
2028                 if (rdev->sb_loaded != 1)
2029                         continue; /* no noise on spare devices */
2030                 if (test_bit(Faulty, &rdev->flags))
2031                         dprintk("(skipping faulty ");
2032
2033                 dprintk("%s ", bdevname(rdev->bdev,b));
2034                 if (!test_bit(Faulty, &rdev->flags)) {
2035                         md_super_write(mddev,rdev,
2036                                        rdev->sb_start, rdev->sb_size,
2037                                        rdev->sb_page);
2038                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2039                                 bdevname(rdev->bdev,b),
2040                                 (unsigned long long)rdev->sb_start);
2041                         rdev->sb_events = mddev->events;
2042
2043                 } else
2044                         dprintk(")\n");
2045                 if (mddev->level == LEVEL_MULTIPATH)
2046                         /* only need to write one superblock... */
2047                         break;
2048         }
2049         md_super_wait(mddev);
2050         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2051
2052         spin_lock_irq(&mddev->write_lock);
2053         if (mddev->in_sync != sync_req ||
2054             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2055                 /* have to write it out again */
2056                 spin_unlock_irq(&mddev->write_lock);
2057                 goto repeat;
2058         }
2059         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2060         spin_unlock_irq(&mddev->write_lock);
2061         wake_up(&mddev->sb_wait);
2062         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2063                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2064
2065 }
2066
2067 /* words written to sysfs files may, or may not, be \n terminated.
2068  * We want to accept either case. For this we use cmd_match.
2069  */
2070 static int cmd_match(const char *cmd, const char *str)
2071 {
2072         /* See if cmd, written into a sysfs file, matches
2073          * str.  They must either be the same, or cmd can
2074          * have a trailing newline
2075          */
2076         while (*cmd && *str && *cmd == *str) {
2077                 cmd++;
2078                 str++;
2079         }
2080         if (*cmd == '\n')
2081                 cmd++;
2082         if (*str || *cmd)
2083                 return 0;
2084         return 1;
2085 }
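/* e.g. cmd_match("faulty\n", "faulty") and cmd_match("faulty", "faulty")
 * both return 1, while cmd_match("fault", "faulty") returns 0.
 */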
2086
2087 struct rdev_sysfs_entry {
2088         struct attribute attr;
2089         ssize_t (*show)(mdk_rdev_t *, char *);
2090         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2091 };
2092
2093 static ssize_t
2094 state_show(mdk_rdev_t *rdev, char *page)
2095 {
2096         char *sep = "";
2097         size_t len = 0;
2098
2099         if (test_bit(Faulty, &rdev->flags)) {
2100                 len+= sprintf(page+len, "%sfaulty",sep);
2101                 sep = ",";
2102         }
2103         if (test_bit(In_sync, &rdev->flags)) {
2104                 len += sprintf(page+len, "%sin_sync",sep);
2105                 sep = ",";
2106         }
2107         if (test_bit(WriteMostly, &rdev->flags)) {
2108                 len += sprintf(page+len, "%swrite_mostly",sep);
2109                 sep = ",";
2110         }
2111         if (test_bit(Blocked, &rdev->flags)) {
2112                 len += sprintf(page+len, "%sblocked", sep);
2113                 sep = ",";
2114         }
2115         if (!test_bit(Faulty, &rdev->flags) &&
2116             !test_bit(In_sync, &rdev->flags)) {
2117                 len += sprintf(page+len, "%sspare", sep);
2118                 sep = ",";
2119         }
2120         return len+sprintf(page+len, "\n");
2121 }
2122
2123 static ssize_t
2124 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2125 {
2126         /* can write
2127          *  faulty  - simulates an error
2128          *  remove  - disconnects the device
2129          *  writemostly - sets write_mostly
2130          *  -writemostly - clears write_mostly
2131          *  blocked - sets the Blocked flag
2132          *  -blocked - clears the Blocked flag
2133          *  insync - sets In_sync, provided the device isn't active
2134          */
2135         int err = -EINVAL;
2136         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2137                 md_error(rdev->mddev, rdev);
2138                 err = 0;
2139         } else if (cmd_match(buf, "remove")) {
2140                 if (rdev->raid_disk >= 0)
2141                         err = -EBUSY;
2142                 else {
2143                         mddev_t *mddev = rdev->mddev;
2144                         kick_rdev_from_array(rdev);
2145                         if (mddev->pers)
2146                                 md_update_sb(mddev, 1);
2147                         md_new_event(mddev);
2148                         err = 0;
2149                 }
2150         } else if (cmd_match(buf, "writemostly")) {
2151                 set_bit(WriteMostly, &rdev->flags);
2152                 err = 0;
2153         } else if (cmd_match(buf, "-writemostly")) {
2154                 clear_bit(WriteMostly, &rdev->flags);
2155                 err = 0;
2156         } else if (cmd_match(buf, "blocked")) {
2157                 set_bit(Blocked, &rdev->flags);
2158                 err = 0;
2159         } else if (cmd_match(buf, "-blocked")) {
2160                 clear_bit(Blocked, &rdev->flags);
2161                 wake_up(&rdev->blocked_wait);
2162                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2163                 md_wakeup_thread(rdev->mddev->thread);
2164
2165                 err = 0;
2166         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2167                 set_bit(In_sync, &rdev->flags);
2168                 err = 0;
2169         }
2170         if (!err && rdev->sysfs_state)
2171                 sysfs_notify_dirent(rdev->sysfs_state);
2172         return err ? err : len;
2173 }
2174 static struct rdev_sysfs_entry rdev_state =
2175 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
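/* Illustrative usage from userspace (device names are examples):
 *
 *   echo faulty > /sys/block/md0/md/dev-sda1/state   # simulate an error
 *   echo remove > /sys/block/md0/md/dev-sda1/state   # detach an inactive device
 */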
2176
2177 static ssize_t
2178 errors_show(mdk_rdev_t *rdev, char *page)
2179 {
2180         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2181 }
2182
2183 static ssize_t
2184 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2185 {
2186         char *e;
2187         unsigned long n = simple_strtoul(buf, &e, 10);
2188         if (*buf && (*e == 0 || *e == '\n')) {
2189                 atomic_set(&rdev->corrected_errors, n);
2190                 return len;
2191         }
2192         return -EINVAL;
2193 }
2194 static struct rdev_sysfs_entry rdev_errors =
2195 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2196
2197 static ssize_t
2198 slot_show(mdk_rdev_t *rdev, char *page)
2199 {
2200         if (rdev->raid_disk < 0)
2201                 return sprintf(page, "none\n");
2202         else
2203                 return sprintf(page, "%d\n", rdev->raid_disk);
2204 }
2205
2206 static ssize_t
2207 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2208 {
2209         char *e;
2210         int err;
2211         char nm[20];
2212         int slot = simple_strtoul(buf, &e, 10);
2213         if (strncmp(buf, "none", 4)==0)
2214                 slot = -1;
2215         else if (e==buf || (*e && *e!= '\n'))
2216                 return -EINVAL;
2217         if (rdev->mddev->pers && slot == -1) {
2218                 /* Setting 'slot' on an active array requires also
2219                  * updating the 'rd%d' link, and communicating
2220                  * with the personality with ->hot_*_disk.
2221                  * For now we only support removing
2222                  * failed/spare devices.  This normally happens automatically,
2223                  * but not when the metadata is externally managed.
2224                  */
2225                 if (rdev->raid_disk == -1)
2226                         return -EEXIST;
2227                 /* personality does all needed checks */
2228                 if (rdev->mddev->pers->hot_add_disk == NULL)
2229                         return -EINVAL;
2230                 err = rdev->mddev->pers->
2231                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2232                 if (err)
2233                         return err;
2234                 sprintf(nm, "rd%d", rdev->raid_disk);
2235                 sysfs_remove_link(&rdev->mddev->kobj, nm);
2236                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2237                 md_wakeup_thread(rdev->mddev->thread);
2238         } else if (rdev->mddev->pers) {
2239                 mdk_rdev_t *rdev2;
2240                 /* Activating a spare .. or possibly reactivating
2241                  * if we ever get bitmaps working here.
2242                  */
2243
2244                 if (rdev->raid_disk != -1)
2245                         return -EBUSY;
2246
2247                 if (rdev->mddev->pers->hot_add_disk == NULL)
2248                         return -EINVAL;
2249
2250                 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2251                         if (rdev2->raid_disk == slot)
2252                                 return -EEXIST;
2253
2254                 rdev->raid_disk = slot;
2255                 if (test_bit(In_sync, &rdev->flags))
2256                         rdev->saved_raid_disk = slot;
2257                 else
2258                         rdev->saved_raid_disk = -1;
2259                 err = rdev->mddev->pers->
2260                         hot_add_disk(rdev->mddev, rdev);
2261                 if (err) {
2262                         rdev->raid_disk = -1;
2263                         return err;
2264                 } else
2265                         sysfs_notify_dirent(rdev->sysfs_state);
2266                 sprintf(nm, "rd%d", rdev->raid_disk);
2267                 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2268                         printk(KERN_WARNING
2269                                "md: cannot register "
2270                                "%s for %s\n",
2271                                nm, mdname(rdev->mddev));
2272
2273                 /* don't wakeup anyone, leave that to userspace. */
2274         } else {
2275                 if (slot >= rdev->mddev->raid_disks)
2276                         return -ENOSPC;
2277                 rdev->raid_disk = slot;
2278                 /* assume it is working */
2279                 clear_bit(Faulty, &rdev->flags);
2280                 clear_bit(WriteMostly, &rdev->flags);
2281                 set_bit(In_sync, &rdev->flags);
2282                 sysfs_notify_dirent(rdev->sysfs_state);
2283         }
2284         return len;
2285 }
2286
2287
2288 static struct rdev_sysfs_entry rdev_slot =
2289 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
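/* Illustrative usage (device names are examples):
 *
 *   echo 2 > /sys/block/md0/md/dev-sdc1/slot     # activate a spare in slot 2
 *   echo none > /sys/block/md0/md/dev-sdc1/slot  # detach a failed/spare device
 */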
2290
2291 static ssize_t
2292 offset_show(mdk_rdev_t *rdev, char *page)
2293 {
2294         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2295 }
2296
2297 static ssize_t
2298 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2299 {
2300         char *e;
2301         unsigned long long offset = simple_strtoull(buf, &e, 10);
2302         if (e==buf || (*e && *e != '\n'))
2303                 return -EINVAL;
2304         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2305                 return -EBUSY;
2306         if (rdev->sectors && rdev->mddev->external)
2307                 /* Must set offset before size, so overlap checks
2308                  * can be sane */
2309                 return -EBUSY;
2310         rdev->data_offset = offset;
2311         return len;
2312 }
2313
2314 static struct rdev_sysfs_entry rdev_offset =
2315 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2316
2317 static ssize_t
2318 rdev_size_show(mdk_rdev_t *rdev, char *page)
2319 {
2320         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2321 }
2322
2323 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2324 {
2325         /* check if two start/length pairs overlap */
2326         if (s1+l1 <= s2)
2327                 return 0;
2328         if (s2+l2 <= s1)
2329                 return 0;
2330         return 1;
2331 }
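/* e.g. overlaps(0, 100, 50, 100) == 1, but overlaps(0, 50, 50, 50) == 0:
 * ranges that merely touch do not count as overlapping.
 */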
2332
2333 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2334 {
2335         unsigned long long blocks;
2336         sector_t new;
2337
2338         if (strict_strtoull(buf, 10, &blocks) < 0)
2339                 return -EINVAL;
2340
2341         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2342                 return -EINVAL; /* sector conversion overflow */
2343
2344         new = blocks * 2;
2345         if (new != blocks * 2)
2346                 return -EINVAL; /* unsigned long long to sector_t overflow */
2347
2348         *sectors = new;
2349         return 0;
2350 }
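/* 'buf' is in 1K blocks, so e.g. "1048576" (1 GiB) yields
 * *sectors == 2097152 512-byte sectors.
 */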
2351
2352 static ssize_t
2353 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2354 {
2355         mddev_t *my_mddev = rdev->mddev;
2356         sector_t oldsectors = rdev->sectors;
2357         sector_t sectors;
2358
2359         if (strict_blocks_to_sectors(buf, &sectors) < 0)
2360                 return -EINVAL;
2361         if (my_mddev->pers && rdev->raid_disk >= 0) {
2362                 if (my_mddev->persistent) {
2363                         sectors = super_types[my_mddev->major_version].
2364                                 rdev_size_change(rdev, sectors);
2365                         if (!sectors)
2366                                 return -EBUSY;
2367                 } else if (!sectors)
2368                         sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2369                                 rdev->data_offset;
2370         }
2371         if (sectors < my_mddev->dev_sectors)
2372                 return -EINVAL; /* component must fit device */
2373
2374         rdev->sectors = sectors;
2375         if (sectors > oldsectors && my_mddev->external) {
2376                 /* need to check that all other rdevs with the same ->bdev
2377                  * do not overlap.  We need to unlock the mddev to avoid
2378                  * a deadlock.  We have already changed rdev->sectors, and if
2379                  * we have to change it back, we will have the lock again.
2380                  */
2381                 mddev_t *mddev;
2382                 int overlap = 0;
2383                 struct list_head *tmp;
2384
2385                 mddev_unlock(my_mddev);
2386                 for_each_mddev(mddev, tmp) {
2387                         mdk_rdev_t *rdev2;
2388
2389                         mddev_lock(mddev);
2390                         list_for_each_entry(rdev2, &mddev->disks, same_set)
2391                                 if (test_bit(AllReserved, &rdev2->flags) ||
2392                                     (rdev->bdev == rdev2->bdev &&
2393                                      rdev != rdev2 &&
2394                                      overlaps(rdev->data_offset, rdev->sectors,
2395                                               rdev2->data_offset,
2396                                               rdev2->sectors))) {
2397                                         overlap = 1;
2398                                         break;
2399                                 }
2400                         mddev_unlock(mddev);
2401                         if (overlap) {
2402                                 mddev_put(mddev);
2403                                 break;
2404                         }
2405                 }
2406                 mddev_lock(my_mddev);
2407                 if (overlap) {
2408                         /* Someone else could have slipped in a size
2409                          * change here, but doing so is just silly.
2410                          * We put oldsectors back because we *know* it is
2411                          * safe, and trust userspace not to race with
2412                          * itself
2413                          */
2414                         rdev->sectors = oldsectors;
2415                         return -EBUSY;
2416                 }
2417         }
2418         return len;
2419 }
2420
2421 static struct rdev_sysfs_entry rdev_size =
2422 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
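/* Values read and written here are 1K blocks (sectors / 2).  Writing
 * "0" to an active member means "grow to the largest size the
 * underlying device allows" (see rdev_size_store above).
 */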
2423
2424 static struct attribute *rdev_default_attrs[] = {
2425         &rdev_state.attr,
2426         &rdev_errors.attr,
2427         &rdev_slot.attr,
2428         &rdev_offset.attr,
2429         &rdev_size.attr,
2430         NULL,
2431 };
2432 static ssize_t
2433 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2434 {
2435         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2436         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2437         mddev_t *mddev = rdev->mddev;
2438         ssize_t rv;
2439
2440         if (!entry->show)
2441                 return -EIO;
2442
2443         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2444         if (!rv) {
2445                 if (rdev->mddev == NULL)
2446                         rv = -EBUSY;
2447                 else
2448                         rv = entry->show(rdev, page);
2449                 mddev_unlock(mddev);
2450         }
2451         return rv;
2452 }
2453
2454 static ssize_t
2455 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2456               const char *page, size_t length)
2457 {
2458         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2459         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2460         ssize_t rv;
2461         mddev_t *mddev = rdev->mddev;
2462
2463         if (!entry->store)
2464                 return -EIO;
2465         if (!capable(CAP_SYS_ADMIN))
2466                 return -EACCES;
2467         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2468         if (!rv) {
2469                 if (rdev->mddev == NULL)
2470                         rv = -EBUSY;
2471                 else
2472                         rv = entry->store(rdev, page, length);
2473                 mddev_unlock(mddev);
2474         }
2475         return rv;
2476 }
2477
2478 static void rdev_free(struct kobject *ko)
2479 {
2480         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2481         kfree(rdev);
2482 }
2483 static struct sysfs_ops rdev_sysfs_ops = {
2484         .show           = rdev_attr_show,
2485         .store          = rdev_attr_store,
2486 };
2487 static struct kobj_type rdev_ktype = {
2488         .release        = rdev_free,
2489         .sysfs_ops      = &rdev_sysfs_ops,
2490         .default_attrs  = rdev_default_attrs,
2491 };
2492
2493 /*
2494  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2495  *
2496  * mark the device faulty if:
2497  *
2498  *   - the device is nonexistent (zero size)
2499  *   - the device has no valid superblock
2500  *
2501  * a faulty rdev _never_ has rdev->sb set.
2502  */
2503 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2504 {
2505         char b[BDEVNAME_SIZE];
2506         int err;
2507         mdk_rdev_t *rdev;
2508         sector_t size;
2509
2510         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2511         if (!rdev) {
2512                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2513                 return ERR_PTR(-ENOMEM);
2514         }
2515
2516         if ((err = alloc_disk_sb(rdev)))
2517                 goto abort_free;
2518
2519         err = lock_rdev(rdev, newdev, super_format == -2);
2520         if (err)
2521                 goto abort_free;
2522
2523         kobject_init(&rdev->kobj, &rdev_ktype);
2524
2525         rdev->desc_nr = -1;
2526         rdev->saved_raid_disk = -1;
2527         rdev->raid_disk = -1;
2528         rdev->flags = 0;
2529         rdev->data_offset = 0;
2530         rdev->sb_events = 0;
2531         atomic_set(&rdev->nr_pending, 0);
2532         atomic_set(&rdev->read_errors, 0);
2533         atomic_set(&rdev->corrected_errors, 0);
2534
2535         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2536         if (!size) {
2537                 printk(KERN_WARNING 
2538                         "md: %s has zero or unknown size, marking faulty!\n",
2539                         bdevname(rdev->bdev,b));
2540                 err = -EINVAL;
2541                 goto abort_free;
2542         }
2543
2544         if (super_format >= 0) {
2545                 err = super_types[super_format].
2546                         load_super(rdev, NULL, super_minor);
2547                 if (err == -EINVAL) {
2548                         printk(KERN_WARNING
2549                                 "md: %s does not have a valid v%d.%d "
2550                                "superblock, not importing!\n",
2551                                 bdevname(rdev->bdev,b),
2552                                super_format, super_minor);
2553                         goto abort_free;
2554                 }
2555                 if (err < 0) {
2556                         printk(KERN_WARNING 
2557                                 "md: could not read %s's sb, not importing!\n",
2558                                 bdevname(rdev->bdev,b));
2559                         goto abort_free;
2560                 }
2561         }
2562
2563         INIT_LIST_HEAD(&rdev->same_set);
2564         init_waitqueue_head(&rdev->blocked_wait);
2565
2566         return rdev;
2567
2568 abort_free:
2569         if (rdev->sb_page) {
2570                 if (rdev->bdev)
2571                         unlock_rdev(rdev);
2572                 free_disk_sb(rdev);
2573         }
2574         kfree(rdev);
2575         return ERR_PTR(err);
2576 }
2577
2578 /*
2579  * Check a full RAID array for plausibility
2580  */
2581
2582
2583 static void analyze_sbs(mddev_t * mddev)
2584 {
2585         int i;
2586         mdk_rdev_t *rdev, *freshest, *tmp;
2587         char b[BDEVNAME_SIZE];
2588
2589         freshest = NULL;
2590         rdev_for_each(rdev, tmp, mddev)
2591                 switch (super_types[mddev->major_version].
2592                         load_super(rdev, freshest, mddev->minor_version)) {
2593                 case 1:
2594                         freshest = rdev;
2595                         break;
2596                 case 0:
2597                         break;
2598                 default:
2599                         printk(KERN_ERR
2600                                 "md: fatal superblock inconsistency in %s"
2601                                 " -- removing from array\n",
2602                                 bdevname(rdev->bdev,b));
2603                         kick_rdev_from_array(rdev);
2604                 }
2605
2606
2607         super_types[mddev->major_version].
2608                 validate_super(mddev, freshest);
2609
2610         i = 0;
2611         rdev_for_each(rdev, tmp, mddev) {
2612                 if (rdev->desc_nr >= mddev->max_disks ||
2613                     i > mddev->max_disks) {
2614                         printk(KERN_WARNING
2615                                "md: %s: %s: only %d devices permitted\n",
2616                                mdname(mddev), bdevname(rdev->bdev, b),
2617                                mddev->max_disks);
2618                         kick_rdev_from_array(rdev);
2619                         continue;
2620                 }
2621                 if (rdev != freshest)
2622                         if (super_types[mddev->major_version].
2623                             validate_super(mddev, rdev)) {
2624                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2625                                         " from array!\n",
2626                                         bdevname(rdev->bdev,b));
2627                                 kick_rdev_from_array(rdev);
2628                                 continue;
2629                         }
2630                 if (mddev->level == LEVEL_MULTIPATH) {
2631                         rdev->desc_nr = i++;
2632                         rdev->raid_disk = rdev->desc_nr;
2633                         set_bit(In_sync, &rdev->flags);
2634                 } else if (rdev->raid_disk >= mddev->raid_disks) {
2635                         rdev->raid_disk = -1;
2636                         clear_bit(In_sync, &rdev->flags);
2637                 }
2638         }
2639 }
2640
2641 static void md_safemode_timeout(unsigned long data);
2642
2643 static ssize_t
2644 safe_delay_show(mddev_t *mddev, char *page)
2645 {
2646         int msec = (mddev->safemode_delay*1000)/HZ;
2647         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2648 }
2649 static ssize_t
2650 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2651 {
2652         int scale=1;
2653         int dot=0;
2654         int i;
2655         unsigned long msec;
2656         char buf[30];
2657
2658         /* remove a period, and count digits after it */
2659         if (len >= sizeof(buf))
2660                 return -EINVAL;
2661         strlcpy(buf, cbuf, sizeof(buf));
2662         for (i=0; i<len; i++) {
2663                 if (dot) {
2664                         if (isdigit(buf[i])) {
2665                                 buf[i-1] = buf[i];
2666                                 scale *= 10;
2667                         }
2668                         buf[i] = 0;
2669                 } else if (buf[i] == '.') {
2670                         dot=1;
2671                         buf[i] = 0;
2672                 }
2673         }
2674         if (strict_strtoul(buf, 10, &msec) < 0)
2675                 return -EINVAL;
2676         msec = (msec * 1000) / scale;
2677         if (msec == 0)
2678                 mddev->safemode_delay = 0;
2679         else {
2680                 unsigned long old_delay = mddev->safemode_delay;
2681                 mddev->safemode_delay = (msec*HZ)/1000;
2682                 if (mddev->safemode_delay == 0)
2683                         mddev->safemode_delay = 1;
2684                 if (mddev->safemode_delay < old_delay)
2685                         md_safemode_timeout((unsigned long)mddev);
2686         }
2687         return len;
2688 }
2689 static struct md_sysfs_entry md_safe_delay =
2690 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
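/* e.g. echo 0.200 > /sys/block/md0/md/safe_mode_delay
 * sets a 200 ms safemode delay; the parser above strips the '.' and
 * scales, so "0.2", "0.20" and "0.200" are all equivalent.
 */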
2691
2692 static ssize_t
2693 level_show(mddev_t *mddev, char *page)
2694 {
2695         struct mdk_personality *p = mddev->pers;
2696         if (p)
2697                 return sprintf(page, "%s\n", p->name);
2698         else if (mddev->clevel[0])
2699                 return sprintf(page, "%s\n", mddev->clevel);
2700         else if (mddev->level != LEVEL_NONE)
2701                 return sprintf(page, "%d\n", mddev->level);
2702         else
2703                 return 0;
2704 }
2705
2706 static ssize_t
2707 level_store(mddev_t *mddev, const char *buf, size_t len)
2708 {
2709         char level[16];
2710         ssize_t rv = len;
2711         struct mdk_personality *pers;
2712         void *priv;
2713         mdk_rdev_t *rdev;
2714
2715         if (mddev->pers == NULL) {
2716                 if (len == 0)
2717                         return 0;
2718                 if (len >= sizeof(mddev->clevel))
2719                         return -ENOSPC;
2720                 strncpy(mddev->clevel, buf, len);
2721                 if (mddev->clevel[len-1] == '\n')
2722                         len--;
2723                 mddev->clevel[len] = 0;
2724                 mddev->level = LEVEL_NONE;
2725                 return rv;
2726         }
2727
2728         /* request to change the personality.  Need to ensure:
2729          *  - array is not engaged in resync/recovery/reshape
2730          *  - old personality can be suspended
2731          *  - new personality can take over the existing array (->takeover)
2732          */
2733
2734         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2735                 return -EBUSY;
2736
2737         if (!mddev->pers->quiesce) {
2738                 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2739                        mdname(mddev), mddev->pers->name);
2740                 return -EINVAL;
2741         }
2742
2743         /* Now find the new personality */
2744         if (len == 0 || len >= sizeof(level))
2745                 return -EINVAL;
2746         strncpy(level, buf, len);
2747         if (level[len-1] == '\n')
2748                 len--;
2749         level[len] = 0;
2750
2751         request_module("md-%s", level);
2752         spin_lock(&pers_lock);
2753         pers = find_pers(LEVEL_NONE, level);
2754         if (!pers || !try_module_get(pers->owner)) {
2755                 spin_unlock(&pers_lock);
2756                 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2757                 return -EINVAL;
2758         }
2759         spin_unlock(&pers_lock);
2760
2761         if (pers == mddev->pers) {
2762                 /* Nothing to do! */
2763                 module_put(pers->owner);
2764                 return rv;
2765         }
2766         if (!pers->takeover) {
2767                 module_put(pers->owner);
2768                 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2769                        mdname(mddev), level);
2770                 return -EINVAL;
2771         }
2772
2773         /* ->takeover must set new_* and/or delta_disks
2774          * if it succeeds, and may set them when it fails.
2775          */
2776         priv = pers->takeover(mddev);
2777         if (IS_ERR(priv)) {
2778                 mddev->new_level = mddev->level;
2779                 mddev->new_layout = mddev->layout;
2780                 mddev->new_chunk_sectors = mddev->chunk_sectors;
2781                 mddev->raid_disks -= mddev->delta_disks;
2782                 mddev->delta_disks = 0;
2783                 module_put(pers->owner);
2784                 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2785                        mdname(mddev), level);
2786                 return PTR_ERR(priv);
2787         }
2788
2789         /* Looks like we have a winner */
2790         mddev_suspend(mddev);
2791         mddev->pers->stop(mddev);
2792         module_put(mddev->pers->owner);
2793         /* Invalidate devices that are now superfluous */
2794         list_for_each_entry(rdev, &mddev->disks, same_set)
2795                 if (rdev->raid_disk >= mddev->raid_disks) {
2796                         rdev->raid_disk = -1;
2797                         clear_bit(In_sync, &rdev->flags);
2798                 }
2799         mddev->pers = pers;
2800         mddev->private = priv;
2801         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2802         mddev->level = mddev->new_level;
2803         mddev->layout = mddev->new_layout;
2804         mddev->chunk_sectors = mddev->new_chunk_sectors;
2805         mddev->delta_disks = 0;
2806         pers->run(mddev);
2807         mddev_resume(mddev);
2808         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2809         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2810         md_wakeup_thread(mddev->thread);
2811         return rv;
2812 }
2813
2814 static struct md_sysfs_entry md_level =
2815 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
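/* e.g. echo raid5 > /sys/block/md0/md/level
 * On a running array this loads md-raid5 if necessary (request_module)
 * and attempts an online conversion via the new personality's ->takeover.
 */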
2816
2817
2818 static ssize_t
2819 layout_show(mddev_t *mddev, char *page)
2820 {
2821         /* just a number, not meaningful for all levels */
2822         if (mddev->reshape_position != MaxSector &&
2823             mddev->layout != mddev->new_layout)
2824                 return sprintf(page, "%d (%d)\n",
2825                                mddev->new_layout, mddev->layout);
2826         return sprintf(page, "%d\n", mddev->layout);
2827 }
2828
2829 static ssize_t
2830 layout_store(mddev_t *mddev, const char *buf, size_t len)
2831 {
2832         char *e;
2833         unsigned long n = simple_strtoul(buf, &e, 10);
2834
2835         if (!*buf || (*e && *e != '\n'))
2836                 return -EINVAL;
2837
2838         if (mddev->pers) {
2839                 int err;
2840                 if (mddev->pers->check_reshape == NULL)
2841                         return -EBUSY;
2842                 mddev->new_layout = n;
2843                 err = mddev->pers->check_reshape(mddev);
2844                 if (err) {
2845                         mddev->new_layout = mddev->layout;
2846                         return err;
2847                 }
2848         } else {
2849                 mddev->new_layout = n;
2850                 if (mddev->reshape_position == MaxSector)
2851                         mddev->layout = n;
2852         }
2853         return len;
2854 }
2855 static struct md_sysfs_entry md_layout =
2856 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2857
2858
2859 static ssize_t
2860 raid_disks_show(mddev_t *mddev, char *page)
2861 {
2862         if (mddev->raid_disks == 0)
2863                 return 0;
2864         if (mddev->reshape_position != MaxSector &&
2865             mddev->delta_disks != 0)
2866                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2867                                mddev->raid_disks - mddev->delta_disks);
2868         return sprintf(page, "%d\n", mddev->raid_disks);
2869 }
2870
2871 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2872
2873 static ssize_t
2874 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2875 {
2876         char *e;
2877         int rv = 0;
2878         unsigned long n = simple_strtoul(buf, &e, 10);
2879
2880         if (!*buf || (*e && *e != '\n'))
2881                 return -EINVAL;
2882
2883         if (mddev->pers)
2884                 rv = update_raid_disks(mddev, n);
2885         else if (mddev->reshape_position != MaxSector) {
2886                 int olddisks = mddev->raid_disks - mddev->delta_disks;
2887                 mddev->delta_disks = n - olddisks;
2888                 mddev->raid_disks = n;
2889         } else
2890                 mddev->raid_disks = n;
2891         return rv ? rv : len;
2892 }
2893 static struct md_sysfs_entry md_raid_disks =
2894 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2895
2896 static ssize_t
2897 chunk_size_show(mddev_t *mddev, char *page)
2898 {
2899         if (mddev->reshape_position != MaxSector &&
2900             mddev->chunk_sectors != mddev->new_chunk_sectors)
2901                 return sprintf(page, "%d (%d)\n",
2902                                mddev->new_chunk_sectors << 9,
2903                                mddev->chunk_sectors << 9);
2904         return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2905 }
2906
2907 static ssize_t
2908 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2909 {
2910         char *e;
2911         unsigned long n = simple_strtoul(buf, &e, 10);
2912
2913         if (!*buf || (*e && *e != '\n'))
2914                 return -EINVAL;
2915
2916         if (mddev->pers) {
2917                 int err;
2918                 if (mddev->pers->check_reshape == NULL)
2919                         return -EBUSY;
2920                 mddev->new_chunk_sectors = n >> 9;
2921                 err = mddev->pers->check_reshape(mddev);
2922                 if (err) {
2923                         mddev->new_chunk_sectors = mddev->chunk_sectors;
2924                         return err;
2925                 }
2926         } else {
2927                 mddev->new_chunk_sectors = n >> 9;
2928                 if (mddev->reshape_position == MaxSector)
2929                         mddev->chunk_sectors = n >> 9;
2930         }
2931         return len;
2932 }
2933 static struct md_sysfs_entry md_chunk_size =
2934 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2935
2936 static ssize_t
2937 resync_start_show(mddev_t *mddev, char *page)
2938 {
2939         if (mddev->recovery_cp == MaxSector)
2940                 return sprintf(page, "none\n");
2941         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2942 }
2943
2944 static ssize_t
2945 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2946 {
2947         char *e;
2948         unsigned long long n = simple_strtoull(buf, &e, 10);
2949
2950         if (mddev->pers)
2951                 return -EBUSY;
2952         if (!*buf || (*e && *e != '\n'))
2953                 return -EINVAL;
2954
2955         mddev->recovery_cp = n;
2956         return len;
2957 }
2958 static struct md_sysfs_entry md_resync_start =
2959 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2960
2961 /*
2962  * The array state can be:
2963  *
2964  * clear
2965  *     No devices, no size, no level
2966  *     Equivalent to STOP_ARRAY ioctl
2967  * inactive
2968  *     May have some settings, but array is not active
2969  *        all IO results in error
2970  *     When written, doesn't tear down array, but just stops it
2971  * suspended (not supported yet)
2972  *     All IO requests will block. The array can be reconfigured.
2973  *     Writing this, if accepted, will block until array is quiescent
2974  * readonly
2975  *     no resync can happen.  no superblocks get written.
2976  *     write requests fail
2977  * read-auto
2978  *     like readonly, but behaves like 'clean' on a write request.
2979  *
2980  * clean - no pending writes, but otherwise active.
2981  *     When written to inactive array, starts without resync
2982  *     If a write request arrives then
2983  *       if metadata is known, mark 'dirty' and switch to 'active'.
2984  *       if not known, block and switch to write-pending
2985  *     If written to an active array that has pending writes, then fails.
2986  * active
2987  *     fully active: IO and resync can be happening.
2988  *     When written to inactive array, starts with resync
2989  *
2990  * write-pending
2991  *     clean, but writes are blocked waiting for 'active' to be written.
2992  *
2993  * active-idle
2994  *     like active, but no writes have been seen for a while (100msec).
2995  *
2996  */
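/* Illustrative sysfs usage for array_state (md0 is an assumed array name;
 * the accepted words are exactly those listed above):
 *   cat /sys/block/md0/md/array_state          # e.g. "clean"
 *   echo readonly > /sys/block/md0/md/array_state
 */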
2997 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2998                    write_pending, active_idle, bad_word};
2999 static char *array_states[] = {
3000         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3001         "write-pending", "active-idle", NULL };
3002
3003 static int match_word(const char *word, char **list)
3004 {
3005         int n;
3006         for (n=0; list[n]; n++)
3007                 if (cmd_match(word, list[n]))
3008                         break;
3009         return n;
3010 }
3011
3012 static ssize_t
3013 array_state_show(mddev_t *mddev, char *page)
3014 {
3015         enum array_state st = inactive;
3016
3017         if (mddev->pers)
3018                 switch(mddev->ro) {
3019                 case 1:
3020                         st = readonly;
3021                         break;
3022                 case 2:
3023                         st = read_auto;
3024                         break;
3025                 case 0:
3026                         if (mddev->in_sync)
3027                                 st = clean;
3028                         else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3029                                 st = write_pending;
3030                         else if (mddev->safemode)
3031                                 st = active_idle;
3032                         else
3033                                 st = active;
3034                 }
3035         else {
3036                 if (list_empty(&mddev->disks) &&
3037                     mddev->raid_disks == 0 &&
3038                     mddev->dev_sectors == 0)
3039                         st = clear;
3040                 else
3041                         st = inactive;
3042         }
3043         return sprintf(page, "%s\n", array_states[st]);
3044 }
3045
3046 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3047 static int do_md_run(mddev_t * mddev);
3048 static int restart_array(mddev_t *mddev);
3049
3050 static ssize_t
3051 array_state_store(mddev_t *mddev, const char *buf, size_t len)
3052 {
3053         int err = -EINVAL;
3054         enum array_state st = match_word(buf, array_states);
3055         switch(st) {
3056         case bad_word:
3057                 break;
3058         case clear:
3059                 /* stopping an active array */
3060                 if (atomic_read(&mddev->openers) > 0)
3061                         return -EBUSY;
3062                 err = do_md_stop(mddev, 0, 0);
3063                 break;
3064         case inactive:
3065                 /* stopping an active array */
3066                 if (mddev->pers) {
3067                         if (atomic_read(&mddev->openers) > 0)
3068                                 return -EBUSY;
3069                         err = do_md_stop(mddev, 2, 0);
3070                 } else
3071                         err = 0; /* already inactive */
3072                 break;
3073         case suspended:
3074                 break; /* not supported yet */
3075         case readonly:
3076                 if (mddev->pers)
3077                         err = do_md_stop(mddev, 1, 0);
3078                 else {
3079                         mddev->ro = 1;
3080                         set_disk_ro(mddev->gendisk, 1);
3081                         err = do_md_run(mddev);
3082                 }
3083                 break;
3084         case read_auto:
3085                 if (mddev->pers) {
3086                         if (mddev->ro == 0)
3087                                 err = do_md_stop(mddev, 1, 0);
3088                         else if (mddev->ro == 1)
3089                                 err = restart_array(mddev);
3090                         if (err == 0) {
3091                                 mddev->ro = 2;
3092                                 set_disk_ro(mddev->gendisk, 0);
3093                         }
3094                 } else {
3095                         mddev->ro = 2;
3096                         err = do_md_run(mddev);
3097                 }
3098                 break;
3099         case clean:
3100                 if (mddev->pers) {
3101                         restart_array(mddev);
3102                         spin_lock_irq(&mddev->write_lock);
3103                         if (atomic_read(&mddev->writes_pending) == 0) {
3104                                 if (mddev->in_sync == 0) {
3105                                         mddev->in_sync = 1;
3106                                         if (mddev->safemode == 1)
3107                                                 mddev->safemode = 0;
3108                                         if (mddev->persistent)
3109                                                 set_bit(MD_CHANGE_CLEAN,
3110                                                         &mddev->flags);
3111                                 }
3112                                 err = 0;
3113                         } else
3114                                 err = -EBUSY;
3115                         spin_unlock_irq(&mddev->write_lock);
3116                 } else
3117                         err = -EINVAL;
3118                 break;
3119         case active:
3120                 if (mddev->pers) {
3121                         restart_array(mddev);
3122                         if (mddev->external)
3123                                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3124                         wake_up(&mddev->sb_wait);
3125                         err = 0;
3126                 } else {
3127                         mddev->ro = 0;
3128                         set_disk_ro(mddev->gendisk, 0);
3129                         err = do_md_run(mddev);
3130                 }
3131                 break;
3132         case write_pending:
3133         case active_idle:
3134                 /* these cannot be set */
3135                 break;
3136         }
3137         if (err)
3138                 return err;
3139         else {
3140                 sysfs_notify_dirent(mddev->sysfs_state);
3141                 return len;
3142         }
3143 }
3144 static struct md_sysfs_entry md_array_state =
3145 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3146
3147 static ssize_t
3148 null_show(mddev_t *mddev, char *page)
3149 {
3150         return -EINVAL;
3151 }
3152
3153 static ssize_t
3154 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3155 {
3156         /* buf must be "%d:%d\n" (trailing newline optional) giving major and minor numbers */
3157         /* The new device is added to the array.
3158          * If the array has a persistent superblock, we read the
3159          * superblock to initialise info and check validity.
3160          * Otherwise, only checking done is that in bind_rdev_to_array,
3161          * which mainly checks size.
3162          */
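        /* Illustrative: to hot-add the device with major 8, minor 16
         * (typically /dev/sdb), userspace might do
         *   echo 8:16 > /sys/block/md0/md/new_dev
         * (md0 and 8:16 are assumed values.)
         */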
3163         char *e;
3164         int major = simple_strtoul(buf, &e, 10);
3165         int minor;
3166         dev_t dev;
3167         mdk_rdev_t *rdev;
3168         int err;
3169
3170         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3171                 return -EINVAL;
3172         minor = simple_strtoul(e+1, &e, 10);
3173         if (*e && *e != '\n')
3174                 return -EINVAL;
3175         dev = MKDEV(major, minor);
3176         if (major != MAJOR(dev) ||
3177             minor != MINOR(dev))
3178                 return -EOVERFLOW;
3179
3180
3181         if (mddev->persistent) {
3182                 rdev = md_import_device(dev, mddev->major_version,
3183                                         mddev->minor_version);
3184                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3185                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3186                                                        mdk_rdev_t, same_set);
3187                         err = super_types[mddev->major_version]
3188                                 .load_super(rdev, rdev0, mddev->minor_version);
3189                         if (err < 0)
3190                                 goto out;
3191                 }
3192         } else if (mddev->external)
3193                 rdev = md_import_device(dev, -2, -1);
3194         else
3195                 rdev = md_import_device(dev, -1, -1);
3196
3197         if (IS_ERR(rdev))
3198                 return PTR_ERR(rdev);
3199         err = bind_rdev_to_array(rdev, mddev);
3200  out:
3201         if (err)
3202                 export_rdev(rdev);
3203         return err ? err : len;
3204 }
3205
3206 static struct md_sysfs_entry md_new_device =
3207 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3208
3209 static ssize_t
3210 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3211 {
3212         char *end;
3213         unsigned long chunk, end_chunk;
3214
3215         if (!mddev->bitmap)
3216                 goto out;
3217         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
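        /* Illustrative: "echo '100 200-300' > /sys/block/md0/md/bitmap_set_bits"
         * would dirty chunk 100 and chunks 200 through 300 (md0 and the
         * chunk numbers are assumed values).
         */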
3218         while (*buf) {
3219                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3220                 if (buf == end) break;
3221                 if (*end == '-') { /* range */
3222                         buf = end + 1;
3223                         end_chunk = simple_strtoul(buf, &end, 0);
3224                         if (buf == end) break;
3225                 }
3226                 if (*end && !isspace(*end)) break;
3227                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3228                 buf = end;
3229                 while (isspace(*buf)) buf++;
3230         }
3231         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3232 out:
3233         return len;
3234 }
3235
3236 static struct md_sysfs_entry md_bitmap =
3237 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3238
3239 static ssize_t
3240 size_show(mddev_t *mddev, char *page)
3241 {
3242         return sprintf(page, "%llu\n",
3243                 (unsigned long long)mddev->dev_sectors / 2);
3244 }
3245
3246 static int update_size(mddev_t *mddev, sector_t num_sectors);
3247
3248 static ssize_t
3249 size_store(mddev_t *mddev, const char *buf, size_t len)
3250 {
3251         /* If array is inactive, we can reduce the component size, but
3252          * not increase it (except from 0).
3253          * If array is active, we can try an on-line resize
3254          */
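        /* Illustrative: "echo 1048576 > /sys/block/md0/md/component_size"
         * requests a 1 GiB per-device size (the value is in KiB); md0 and
         * the size are assumed values.
         */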
3255         sector_t sectors;
3256         int err = strict_blocks_to_sectors(buf, &sectors);
3257
3258         if (err < 0)
3259                 return err;
3260         if (mddev->pers) {
3261                 err = update_size(mddev, sectors);
3262                 md_update_sb(mddev, 1);
3263         } else {
3264                 if (mddev->dev_sectors == 0 ||
3265                     mddev->dev_sectors > sectors)
3266                         mddev->dev_sectors = sectors;
3267                 else
3268                         err = -ENOSPC;
3269         }
3270         return err ? err : len;
3271 }
3272
3273 static struct md_sysfs_entry md_size =
3274 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3275
3276
3277 /* Metadata version.
3278  * This is one of
3279  *   'none' for arrays with no metadata (good luck...)
3280  *   'external' for arrays with externally managed metadata,
3281  * or N.M for internally known formats
3282  */
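/* Illustrative writes to /sys/block/md0/md/metadata_version (md0 is an
 * assumed array name; apart from the external:* case the array must not
 * yet hold any devices):
 *   echo 1.2 > metadata_version             # internal v1.2 superblocks
 *   echo external:imsm > metadata_version   # externally managed metadata
 *   echo none > metadata_version
 */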
3283 static ssize_t
3284 metadata_show(mddev_t *mddev, char *page)
3285 {
3286         if (mddev->persistent)
3287                 return sprintf(page, "%d.%d\n",
3288                                mddev->major_version, mddev->minor_version);
3289         else if (mddev->external)
3290                 return sprintf(page, "external:%s\n", mddev->metadata_type);
3291         else
3292                 return sprintf(page, "none\n");
3293 }
3294
3295 static ssize_t
3296 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3297 {
3298         int major, minor;
3299         char *e;
3300         /* Changing the details of 'external' metadata is
3301          * always permitted.  Otherwise there must be
3302          * no devices attached to the array.
3303          */
3304         if (mddev->external && strncmp(buf, "external:", 9) == 0)
3305                 ;
3306         else if (!list_empty(&mddev->disks))
3307                 return -EBUSY;
3308
3309         if (cmd_match(buf, "none")) {
3310                 mddev->persistent = 0;
3311                 mddev->external = 0;
3312                 mddev->major_version = 0;
3313                 mddev->minor_version = 90;
3314                 return len;
3315         }
3316         if (strncmp(buf, "external:", 9) == 0) {
3317                 size_t namelen = len-9;
3318                 if (namelen >= sizeof(mddev->metadata_type))
3319                         namelen = sizeof(mddev->metadata_type)-1;
3320                 strncpy(mddev->metadata_type, buf+9, namelen);
3321                 mddev->metadata_type[namelen] = 0;
3322                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3323                         mddev->metadata_type[--namelen] = 0;
3324                 mddev->persistent = 0;
3325                 mddev->external = 1;
3326                 mddev->major_version = 0;
3327                 mddev->minor_version = 90;
3328                 return len;
3329         }
3330         major = simple_strtoul(buf, &e, 10);
3331         if (e==buf || *e != '.')
3332                 return -EINVAL;
3333         buf = e+1;
3334         minor = simple_strtoul(buf, &e, 10);
3335         if (e==buf || (*e && *e != '\n') )
3336                 return -EINVAL;
3337         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3338                 return -ENOENT;
3339         mddev->major_version = major;
3340         mddev->minor_version = minor;
3341         mddev->persistent = 1;
3342         mddev->external = 0;
3343         return len;
3344 }
3345
3346 static struct md_sysfs_entry md_metadata =
3347 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3348
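/* sync_action accepts: idle, frozen, resync, recover, check, repair and
 * reshape (see action_store below).  An illustrative scrub request,
 * assuming an array md0:
 *   echo check > /sys/block/md0/md/sync_action
 */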
3349 static ssize_t
3350 action_show(mddev_t *mddev, char *page)
3351 {
3352         char *type = "idle";
3353         if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3354                 type = "frozen";
3355         else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3356             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3357                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3358                         type = "reshape";
3359                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3360                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3361                                 type = "resync";
3362                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3363                                 type = "check";
3364                         else
3365                                 type = "repair";
3366                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3367                         type = "recover";
3368         }
3369         return sprintf(page, "%s\n", type);
3370 }
3371
3372 static ssize_t
3373 action_store(mddev_t *mddev, const char *page, size_t len)
3374 {
3375         if (!mddev->pers || !mddev->pers->sync_request)
3376                 return -EINVAL;
3377
3378         if (cmd_match(page, "frozen"))
3379                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3380         else
3381                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3382
3383         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3384                 if (mddev->sync_thread) {
3385                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3386                         md_unregister_thread(mddev->sync_thread);
3387                         mddev->sync_thread = NULL;
3388                         mddev->recovery = 0;
3389                 }
3390         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3391                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3392                 return -EBUSY;
3393         else if (cmd_match(page, "resync"))
3394                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3395         else if (cmd_match(page, "recover")) {
3396                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3397                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3398         } else if (cmd_match(page, "reshape")) {
3399                 int err;
3400                 if (mddev->pers->start_reshape == NULL)
3401                         return -EINVAL;
3402                 err = mddev->pers->start_reshape(mddev);
3403                 if (err)
3404                         return err;
3405                 sysfs_notify(&mddev->kobj, NULL, "degraded");
3406         } else {
3407                 if (cmd_match(page, "check"))
3408                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3409                 else if (!cmd_match(page, "repair"))
3410                         return -EINVAL;
3411                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3412                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3413         }
3414         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3415         md_wakeup_thread(mddev->thread);
3416         sysfs_notify_dirent(mddev->sysfs_action);
3417         return len;
3418 }
3419
3420 static ssize_t
3421 mismatch_cnt_show(mddev_t *mddev, char *page)
3422 {
3423         return sprintf(page, "%llu\n",
3424                        (unsigned long long) mddev->resync_mismatches);
3425 }
3426
3427 static struct md_sysfs_entry md_scan_mode =
3428 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3429
3430
3431 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3432
3433 static ssize_t
3434 sync_min_show(mddev_t *mddev, char *page)
3435 {
3436         return sprintf(page, "%d (%s)\n", speed_min(mddev),
3437                        mddev->sync_speed_min ? "local": "system");
3438 }
3439
3440 static ssize_t
3441 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3442 {
3443         int min;
3444         char *e;
3445         if (strncmp(buf, "system", 6)==0) {
3446                 mddev->sync_speed_min = 0;
3447                 return len;
3448         }
3449         min = simple_strtoul(buf, &e, 10);
3450         if (buf == e || (*e && *e != '\n') || min <= 0)
3451                 return -EINVAL;
3452         mddev->sync_speed_min = min;
3453         return len;
3454 }
3455
3456 static struct md_sysfs_entry md_sync_min =
3457 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3458
3459 static ssize_t
3460 sync_max_show(mddev_t *mddev, char *page)
3461 {
3462         return sprintf(page, "%d (%s)\n", speed_max(mddev),
3463                        mddev->sync_speed_max ? "local": "system");
3464 }
3465
3466 static ssize_t
3467 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3468 {
3469         int max;
3470         char *e;
3471         if (strncmp(buf, "system", 6)==0) {
3472                 mddev->sync_speed_max = 0;
3473                 return len;
3474         }
3475         max = simple_strtoul(buf, &e, 10);
3476         if (buf == e || (*e && *e != '\n') || max <= 0)
3477                 return -EINVAL;
3478         mddev->sync_speed_max = max;
3479         return len;
3480 }
3481
3482 static struct md_sysfs_entry md_sync_max =
3483 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
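/* Illustrative per-array overrides of the global speed limits, assuming
 * an array md0 (values are in KB/sec; "system" reverts to the global
 * sysctl limit):
 *   echo 50000 > /sys/block/md0/md/sync_speed_max
 *   echo system > /sys/block/md0/md/sync_speed_min
 */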
3484
3485 static ssize_t
3486 degraded_show(mddev_t *mddev, char *page)
3487 {
3488         return sprintf(page, "%d\n", mddev->degraded);
3489 }
3490 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3491
3492 static ssize_t
3493 sync_force_parallel_show(mddev_t *mddev, char *page)
3494 {
3495         return sprintf(page, "%d\n", mddev->parallel_resync);
3496 }
3497
3498 static ssize_t
3499 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3500 {
3501         long n;
3502
3503         if (strict_strtol(buf, 10, &n))
3504                 return -EINVAL;
3505
3506         if (n != 0 && n != 1)
3507                 return -EINVAL;
3508
3509         mddev->parallel_resync = n;
3510
3511         if (mddev->sync_thread)
3512                 wake_up(&resync_wait);
3513
3514         return len;
3515 }
3516
3517 /* force parallel resync, even with shared block devices */
3518 static struct md_sysfs_entry md_sync_force_parallel =
3519 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3520        sync_force_parallel_show, sync_force_parallel_store);
3521
3522 static ssize_t
3523 sync_speed_show(mddev_t *mddev, char *page)
3524 {
3525         unsigned long resync, dt, db;
3526         if (mddev->curr_resync == 0)
3527                 return sprintf(page, "none\n");
3528         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3529         dt = (jiffies - mddev->resync_mark) / HZ;
3530         if (!dt) dt++;
3531         db = resync - mddev->resync_mark_cnt;
3532         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3533 }
3534
3535 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3536
3537 static ssize_t
3538 sync_completed_show(mddev_t *mddev, char *page)
3539 {
3540         unsigned long max_sectors, resync;
3541
3542         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3543                 return sprintf(page, "none\n");
3544
3545         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3546                 max_sectors = mddev->resync_max_sectors;
3547         else
3548                 max_sectors = mddev->dev_sectors;
3549
3550         resync = mddev->curr_resync_completed;
3551         return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3552 }
3553
3554 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3555
3556 static ssize_t
3557 min_sync_show(mddev_t *mddev, char *page)
3558 {
3559         return sprintf(page, "%llu\n",
3560                        (unsigned long long)mddev->resync_min);
3561 }
3562 static ssize_t
3563 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3564 {
3565         unsigned long long min;
3566         if (strict_strtoull(buf, 10, &min))
3567                 return -EINVAL;
3568         if (min > mddev->resync_max)
3569                 return -EINVAL;
3570         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3571                 return -EBUSY;
3572
3573         /* Must be a multiple of chunk_size */
3574         if (mddev->chunk_sectors) {
3575                 sector_t temp = min;
3576                 if (sector_div(temp, mddev->chunk_sectors))
3577                         return -EINVAL;
3578         }
3579         mddev->resync_min = min;
3580
3581         return len;
3582 }
3583
3584 static struct md_sysfs_entry md_min_sync =
3585 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3586
3587 static ssize_t
3588 max_sync_show(mddev_t *mddev, char *page)
3589 {
3590         if (mddev->resync_max == MaxSector)
3591                 return sprintf(page, "max\n");
3592         else
3593                 return sprintf(page, "%llu\n",
3594                                (unsigned long long)mddev->resync_max);
3595 }
3596 static ssize_t
3597 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3598 {
3599         if (strncmp(buf, "max", 3) == 0)
3600                 mddev->resync_max = MaxSector;
3601         else {
3602                 unsigned long long max;
3603                 if (strict_strtoull(buf, 10, &max))
3604                         return -EINVAL;
3605                 if (max < mddev->resync_min)
3606                         return -EINVAL;
3607                 if (max < mddev->resync_max &&
3608                     mddev->ro == 0 &&
3609                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3610                         return -EBUSY;
3611
3612                 /* Must be a multiple of chunk_size */
3613                 if (mddev->chunk_sectors) {
3614                         sector_t temp = max;
3615                         if (sector_div(temp, mddev->chunk_sectors))
3616                                 return -EINVAL;
3617                 }
3618                 mddev->resync_max = max;
3619         }
3620         wake_up(&mddev->recovery_wait);
3621         return len;
3622 }
3623
3624 static struct md_sysfs_entry md_max_sync =
3625 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
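/* Illustrative: constrain a resync to a sector window, assuming an array
 * md0 with a 64-sector chunk size (both values must be chunk-aligned):
 *   echo 0 > /sys/block/md0/md/sync_min
 *   echo 1048576 > /sys/block/md0/md/sync_max
 */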
3626
3627 static ssize_t
3628 suspend_lo_show(mddev_t *mddev, char *page)
3629 {
3630         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3631 }
3632
3633 static ssize_t
3634 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3635 {
3636         char *e;
3637         unsigned long long new = simple_strtoull(buf, &e, 10);
3638
3639         if (mddev->pers == NULL ||
3640             mddev->pers->quiesce == NULL)
3641                 return -EINVAL;
3642         if (buf == e || (*e && *e != '\n'))
3643                 return -EINVAL;
3644         if (new >= mddev->suspend_hi ||
3645             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3646                 mddev->suspend_lo = new;
3647                 mddev->pers->quiesce(mddev, 2);
3648                 return len;
3649         } else
3650                 return -EINVAL;
3651 }
3652 static struct md_sysfs_entry md_suspend_lo =
3653 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3654
3655
3656 static ssize_t
3657 suspend_hi_show(mddev_t *mddev, char *page)
3658 {
3659         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3660 }
3661
3662 static ssize_t
3663 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3664 {
3665         char *e;
3666         unsigned long long new = simple_strtoull(buf, &e, 10);
3667
3668         if (mddev->pers == NULL ||
3669             mddev->pers->quiesce == NULL)
3670                 return -EINVAL;
3671         if (buf == e || (*e && *e != '\n'))
3672                 return -EINVAL;
3673         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3674             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3675                 mddev->suspend_hi = new;
3676                 mddev->pers->quiesce(mddev, 1);
3677                 mddev->pers->quiesce(mddev, 0);
3678                 return len;
3679         } else
3680                 return -EINVAL;
3681 }
3682 static struct md_sysfs_entry md_suspend_hi =
3683 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
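/* Illustrative: suspend I/O to sectors [4096, 8192) of an assumed array
 * md0, e.g. while an external metadata handler relocates that region
 * (suspend_lo must be written before suspend_hi for a fresh window):
 *   echo 4096 > /sys/block/md0/md/suspend_lo
 *   echo 8192 > /sys/block/md0/md/suspend_hi
 */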
3684
3685 static ssize_t
3686 reshape_position_show(mddev_t *mddev, char *page)
3687 {
3688         if (mddev->reshape_position != MaxSector)
3689                 return sprintf(page, "%llu\n",
3690                                (unsigned long long)mddev->reshape_position);
3691         strcpy(page, "none\n");
3692         return 5;
3693 }
3694
3695 static ssize_t
3696 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3697 {
3698         char *e;
3699         unsigned long long new = simple_strtoull(buf, &e, 10);
3700         if (mddev->pers)
3701                 return -EBUSY;
3702         if (buf == e || (*e && *e != '\n'))
3703                 return -EINVAL;
3704         mddev->reshape_position = new;
3705         mddev->delta_disks = 0;
3706         mddev->new_level = mddev->level;
3707         mddev->new_layout = mddev->layout;
3708         mddev->new_chunk_sectors = mddev->chunk_sectors;
3709         return len;
3710 }
3711
3712 static struct md_sysfs_entry md_reshape_position =
3713 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3714        reshape_position_store);
3715
3716 static ssize_t
3717 array_size_show(mddev_t *mddev, char *page)
3718 {
3719         if (mddev->external_size)
3720                 return sprintf(page, "%llu\n",
3721                                (unsigned long long)mddev->array_sectors/2);
3722         else
3723                 return sprintf(page, "default\n");
3724 }
3725
3726 static ssize_t
3727 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3728 {
3729         sector_t sectors;
3730
3731         if (strncmp(buf, "default", 7) == 0) {
3732                 if (mddev->pers)
3733                         sectors = mddev->pers->size(mddev, 0, 0);
3734                 else
3735                         sectors = mddev->array_sectors;
3736
3737                 mddev->external_size = 0;
3738         } else {
3739                 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3740                         return -EINVAL;
3741                 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3742                         return -E2BIG;
3743
3744                 mddev->external_size = 1;
3745         }
3746
3747         mddev->array_sectors = sectors;
3748         set_capacity(mddev->gendisk, mddev->array_sectors);
3749         if (mddev->pers)
3750                 revalidate_disk(mddev->gendisk);
3751
3752         return len;
3753 }
3754
3755 static struct md_sysfs_entry md_array_size =
3756 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3757        array_size_store);
3758
3759 static struct attribute *md_default_attrs[] = {
3760         &md_level.attr,
3761         &md_layout.attr,
3762         &md_raid_disks.attr,
3763         &md_chunk_size.attr,
3764         &md_size.attr,
3765         &md_resync_start.attr,
3766         &md_metadata.attr,
3767         &md_new_device.attr,
3768         &md_safe_delay.attr,
3769         &md_array_state.attr,
3770         &md_reshape_position.attr,
3771         &md_array_size.attr,
3772         NULL,
3773 };
3774
3775 static struct attribute *md_redundancy_attrs[] = {
3776         &md_scan_mode.attr,
3777         &md_mismatches.attr,
3778         &md_sync_min.attr,
3779         &md_sync_max.attr,
3780         &md_sync_speed.attr,
3781         &md_sync_force_parallel.attr,
3782         &md_sync_completed.attr,
3783         &md_min_sync.attr,
3784         &md_max_sync.attr,
3785         &md_suspend_lo.attr,
3786         &md_suspend_hi.attr,
3787         &md_bitmap.attr,
3788         &md_degraded.attr,
3789         NULL,
3790 };
3791 static struct attribute_group md_redundancy_group = {
3792         .name = NULL,
3793         .attrs = md_redundancy_attrs,
3794 };
3795
3796
3797 static ssize_t
3798 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3799 {
3800         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3801         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3802         ssize_t rv;
3803
3804         if (!entry->show)
3805                 return -EIO;
3806         rv = mddev_lock(mddev);
3807         if (!rv) {
3808                 rv = entry->show(mddev, page);
3809                 mddev_unlock(mddev);
3810         }
3811         return rv;
3812 }
3813
3814 static ssize_t
3815 md_attr_store(struct kobject *kobj, struct attribute *attr,
3816               const char *page, size_t length)
3817 {
3818         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3819         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3820         ssize_t rv;
3821
3822         if (!entry->store)
3823                 return -EIO;
3824         if (!capable(CAP_SYS_ADMIN))
3825                 return -EACCES;
3826         rv = mddev_lock(mddev);
3827         if (mddev->hold_active == UNTIL_IOCTL)
3828                 mddev->hold_active = 0;
3829         if (!rv) {
3830                 rv = entry->store(mddev, page, length);
3831                 mddev_unlock(mddev);
3832         }
3833         return rv;
3834 }
3835
3836 static void md_free(struct kobject *ko)
3837 {
3838         mddev_t *mddev = container_of(ko, mddev_t, kobj);
3839
3840         if (mddev->sysfs_state)
3841                 sysfs_put(mddev->sysfs_state);
3842
3843         if (mddev->gendisk) {
3844                 del_gendisk(mddev->gendisk);
3845                 put_disk(mddev->gendisk);
3846         }
3847         if (mddev->queue)
3848                 blk_cleanup_queue(mddev->queue);
3849
3850         kfree(mddev);
3851 }
3852
3853 static struct sysfs_ops md_sysfs_ops = {
3854         .show   = md_attr_show,
3855         .store  = md_attr_store,
3856 };
3857 static struct kobj_type md_ktype = {
3858         .release        = md_free,
3859         .sysfs_ops      = &md_sysfs_ops,
3860         .default_attrs  = md_default_attrs,
3861 };
3862
3863 int mdp_major = 0;
3864
3865 static void mddev_delayed_delete(struct work_struct *ws)
3866 {
3867         mddev_t *mddev = container_of(ws, mddev_t, del_work);
3868
3869         if (mddev->private == &md_redundancy_group) {
3870                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3871                 if (mddev->sysfs_action)
3872                         sysfs_put(mddev->sysfs_action);
3873                 mddev->sysfs_action = NULL;
3874                 mddev->private = NULL;
3875         }
3876         kobject_del(&mddev->kobj);
3877         kobject_put(&mddev->kobj);
3878 }
3879
3880 static int md_alloc(dev_t dev, char *name)
3881 {
3882         static DEFINE_MUTEX(disks_mutex);
3883         mddev_t *mddev = mddev_find(dev);
3884         struct gendisk *disk;
3885         int partitioned;
3886         int shift;
3887         int unit;
3888         int error;
3889
3890         if (!mddev)
3891                 return -ENODEV;
3892
3893         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3894         shift = partitioned ? MdpMinorShift : 0;
3895         unit = MINOR(mddev->unit) >> shift;
3896
3897         /* wait for any previous instance of this device
3898          * to be completely removed (mddev_delayed_delete).
3899          */
3900         flush_scheduled_work();
3901
3902         mutex_lock(&disks_mutex);
3903         error = -EEXIST;
3904         if (mddev->gendisk)
3905                 goto abort;
3906
3907         if (name) {
3908                 /* Need to ensure that 'name' is not a duplicate.
3909                  */
3910                 mddev_t *mddev2;
3911                 spin_lock(&all_mddevs_lock);
3912
3913                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3914                         if (mddev2->gendisk &&
3915                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
3916                                 spin_unlock(&all_mddevs_lock);
3917                                 goto abort;
3918                         }
3919                 spin_unlock(&all_mddevs_lock);
3920         }
3921
3922         error = -ENOMEM;
3923         mddev->queue = blk_alloc_queue(GFP_KERNEL);
3924         if (!mddev->queue)
3925                 goto abort;
3926         mddev->queue->queuedata = mddev;
3927
3928         /* Can be unlocked because the queue is new: no concurrency */
3929         queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3930
3931         blk_queue_make_request(mddev->queue, md_make_request);
3932
3933         disk = alloc_disk(1 << shift);
3934         if (!disk) {
3935                 blk_cleanup_queue(mddev->queue);
3936                 mddev->queue = NULL;
3937                 goto abort;
3938         }
3939         disk->major = MAJOR(mddev->unit);
3940         disk->first_minor = unit << shift;
3941         if (name)
3942                 strcpy(disk->disk_name, name);
3943         else if (partitioned)
3944                 sprintf(disk->disk_name, "md_d%d", unit);
3945         else
3946                 sprintf(disk->disk_name, "md%d", unit);
3947         disk->fops = &md_fops;
3948         disk->private_data = mddev;
3949         disk->queue = mddev->queue;
3950         /* Allow extended partitions.  This makes the
3951          * 'mdp' device redundant, but we can't really
3952          * remove it now.
3953          */
3954         disk->flags |= GENHD_FL_EXT_DEVT;
3955         add_disk(disk);
3956         mddev->gendisk = disk;
3957         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3958                                      &disk_to_dev(disk)->kobj, "%s", "md");
3959         if (error) {
3960                 /* This isn't possible, but as kobject_init_and_add is marked
3961                  * __must_check, we must do something with the result
3962                  */
3963                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3964                        disk->disk_name);
3965                 error = 0;
3966         }
3967  abort:
3968         mutex_unlock(&disks_mutex);
3969         if (!error) {
3970                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3971                 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3972         }
3973         mddev_put(mddev);
3974         return error;
3975 }
3976
3977 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3978 {
3979         md_alloc(dev, NULL);
3980         return NULL;
3981 }
3982
3983 static int add_named_array(const char *val, struct kernel_param *kp)
3984 {
3985         /* val must be "md_*" where * is not all digits.
3986          * We allocate an array with a large free minor number, and
3987          * set the name to val.  val must not already be an active name.
3988          */
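        /* Illustrative (assuming this handler is wired up as the
         * "new_array" module parameter of md_mod):
         *   echo md_home > /sys/module/md_mod/parameters/new_array
         */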
3989         int len = strlen(val);
3990         char buf[DISK_NAME_LEN];
3991
3992         while (len && val[len-1] == '\n')
3993                 len--;
3994         if (len >= DISK_NAME_LEN)
3995                 return -E2BIG;
3996         strlcpy(buf, val, len+1);
3997         if (strncmp(buf, "md_", 3) != 0)
3998                 return -EINVAL;
3999         return md_alloc(0, buf);
4000 }
4001
4002 static void md_safemode_timeout(unsigned long data)
4003 {
4004         mddev_t *mddev = (mddev_t *) data;
4005
4006         if (!atomic_read(&mddev->writes_pending)) {
4007                 mddev->safemode = 1;
4008                 if (mddev->external)
4009                         sysfs_notify_dirent(mddev->sysfs_state);
4010         }
4011         md_wakeup_thread(mddev->thread);
4012 }
4013
4014 static int start_dirty_degraded;
4015
4016 static int do_md_run(mddev_t * mddev)
4017 {
4018         int err;
4019         mdk_rdev_t *rdev;
4020         struct gendisk *disk;
4021         struct mdk_personality *pers;
4022
4023         if (list_empty(&mddev->disks))
4024                 /* cannot run an array with no devices... */
4025                 return -EINVAL;
4026
4027         if (mddev->pers)
4028                 return -EBUSY;
4029
4030         /*
4031          * Analyze all RAID superblock(s)
4032          */
4033         if (!mddev->raid_disks) {
4034                 if (!mddev->persistent)
4035                         return -EINVAL;
4036                 analyze_sbs(mddev);
4037         }
4038
4039         if (mddev->level != LEVEL_NONE)
4040                 request_module("md-level-%d", mddev->level);
4041         else if (mddev->clevel[0])
4042                 request_module("md-%s", mddev->clevel);
4043
4044         /*
4045          * Drop all container device buffers, from now on
4046          * the only valid external interface is through the md
4047          * device.
4048          */
4049         list_for_each_entry(rdev, &mddev->disks, same_set) {
4050                 if (test_bit(Faulty, &rdev->flags))
4051                         continue;
4052                 sync_blockdev(rdev->bdev);
4053                 invalidate_bdev(rdev->bdev);
4054
4055                 /* perform some consistency tests on the device.
4056                  * We don't want the data to overlap the metadata;
4057                  * internal bitmap issues have been handled elsewhere.
4058                  */
4059                 if (rdev->data_offset < rdev->sb_start) {
4060                         if (mddev->dev_sectors &&
4061                             rdev->data_offset + mddev->dev_sectors
4062                             > rdev->sb_start) {
4063                                 printk("md: %s: data overlaps metadata\n",
4064                                        mdname(mddev));
4065                                 return -EINVAL;
4066                         }
4067                 } else {
4068                         if (rdev->sb_start + rdev->sb_size/512
4069                             > rdev->data_offset) {
4070                                 printk("md: %s: metadata overlaps data\n",
4071                                        mdname(mddev));
4072                                 return -EINVAL;
4073                         }
4074                 }
4075                 sysfs_notify_dirent(rdev->sysfs_state);
4076         }
4077
4078         md_probe(mddev->unit, NULL, NULL);
4079         disk = mddev->gendisk;
4080         if (!disk)
4081                 return -ENOMEM;
4082
4083         spin_lock(&pers_lock);
4084         pers = find_pers(mddev->level, mddev->clevel);
4085         if (!pers || !try_module_get(pers->owner)) {
4086                 spin_unlock(&pers_lock);
4087                 if (mddev->level != LEVEL_NONE)
4088                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4089                                mddev->level);
4090                 else
4091                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4092                                mddev->clevel);
4093                 return -EINVAL;
4094         }
4095         mddev->pers = pers;
4096         spin_unlock(&pers_lock);
4097         if (mddev->level != pers->level) {
4098                 mddev->level = pers->level;
4099                 mddev->new_level = pers->level;
4100         }
4101         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4102
4103         if (mddev->reshape_position != MaxSector &&
4104             pers->start_reshape == NULL) {
4105                 /* This personality cannot handle reshaping... */
4106                 mddev->pers = NULL;
4107                 module_put(pers->owner);
4108                 return -EINVAL;
4109         }
4110
4111         if (pers->sync_request) {
4112                 /* Warn if this is a potentially silly
4113                  * configuration.
4114                  */
4115                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4116                 mdk_rdev_t *rdev2;
4117                 int warned = 0;
4118
4119                 list_for_each_entry(rdev, &mddev->disks, same_set)
4120                         list_for_each_entry(rdev2, &mddev->disks, same_set) {
4121                                 if (rdev < rdev2 &&
4122                                     rdev->bdev->bd_contains ==
4123                                     rdev2->bdev->bd_contains) {
4124                                         printk(KERN_WARNING
4125                                                "%s: WARNING: %s appears to be"
4126                                                " on the same physical disk as"
4127                                                " %s.\n",
4128                                                mdname(mddev),
4129                                                bdevname(rdev->bdev,b),
4130                                                bdevname(rdev2->bdev,b2));
4131                                         warned = 1;
4132                                 }
4133                         }
4134
4135                 if (warned)
4136                         printk(KERN_WARNING
4137                                "True protection against single-disk"
4138                                " failure might be compromised.\n");
4139         }
4140
4141         mddev->recovery = 0;
4142         /* may be overridden by the personality */
4143         mddev->resync_max_sectors = mddev->dev_sectors;
4144
4145         mddev->barriers_work = 1;
4146         mddev->ok_start_degraded = start_dirty_degraded;
4147
4148         if (start_readonly)
4149                 mddev->ro = 2; /* read-only, but switch on first write */
4150
4151         err = mddev->pers->run(mddev);
4152         if (err)
4153                 printk(KERN_ERR "md: pers->run() failed ...\n");
4154         else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4155                 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4156                           " but 'external_size' not in effect?\n", __func__);
4157                 printk(KERN_ERR
4158                        "md: invalid array_size %llu > default size %llu\n",
4159                        (unsigned long long)mddev->array_sectors / 2,
4160                        (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4161                 err = -EINVAL;
4162                 mddev->pers->stop(mddev);
4163         }
4164         if (err == 0 && mddev->pers->sync_request) {
4165                 err = bitmap_create(mddev);
4166                 if (err) {
4167                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4168                                mdname(mddev), err);
4169                         mddev->pers->stop(mddev);
4170                 }
4171         }
4172         if (err) {
4173                 module_put(mddev->pers->owner);
4174                 mddev->pers = NULL;
4175                 bitmap_destroy(mddev);
4176                 return err;
4177         }
4178         if (mddev->pers->sync_request) {
4179                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4180                         printk(KERN_WARNING
4181                                "md: cannot register extra attributes for %s\n",
4182                                mdname(mddev));
4183                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4184         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4185                 mddev->ro = 0;
4186
4187         atomic_set(&mddev->writes_pending,0);
4188         mddev->safemode = 0;
4189         mddev->safemode_timer.function = md_safemode_timeout;
4190         mddev->safemode_timer.data = (unsigned long) mddev;
4191         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4192         mddev->in_sync = 1;
4193
4194         list_for_each_entry(rdev, &mddev->disks, same_set)
4195                 if (rdev->raid_disk >= 0) {
4196                         char nm[20];
4197                         sprintf(nm, "rd%d", rdev->raid_disk);
4198                         if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4199                                 printk("md: cannot register %s for %s\n",
4200                                        nm, mdname(mddev));
4201                 }
4202
4203         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4204
4205         if (mddev->flags)
4206                 md_update_sb(mddev, 0);
4207
4208         set_capacity(disk, mddev->array_sectors);
4209
4210         /* If there is a partially-recovered drive we need to
4211          * start recovery here.  If we leave it to md_check_recovery,
4212          * it will remove the drives and not do the right thing
4213          */
4214         if (mddev->degraded && !mddev->sync_thread) {
4215                 int spares = 0;
4216                 list_for_each_entry(rdev, &mddev->disks, same_set)
4217                         if (rdev->raid_disk >= 0 &&
4218                             !test_bit(In_sync, &rdev->flags) &&
4219                             !test_bit(Faulty, &rdev->flags))
4220                                 /* complete an interrupted recovery */
4221                                 spares++;
4222                 if (spares && mddev->pers->sync_request) {
4223                         mddev->recovery = 0;
4224                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4225                         mddev->sync_thread = md_register_thread(md_do_sync,
4226                                                                 mddev,
4227                                                                 "resync");
4228                         if (!mddev->sync_thread) {
4229                                 printk(KERN_ERR "%s: could not start resync"
4230                                        " thread...\n",
4231                                        mdname(mddev));
4232                                 /* leave the spares where they are, it shouldn't hurt */
4233                                 mddev->recovery = 0;
4234                         }
4235                 }
4236         }
4237         md_wakeup_thread(mddev->thread);
4238         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4239
4240         revalidate_disk(mddev->gendisk);
4241         mddev->changed = 1;
4242         md_new_event(mddev);
4243         sysfs_notify_dirent(mddev->sysfs_state);
4244         if (mddev->sysfs_action)
4245                 sysfs_notify_dirent(mddev->sysfs_action);
4246         sysfs_notify(&mddev->kobj, NULL, "degraded");
4247         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4248         return 0;
4249 }
4250
4251 static int restart_array(mddev_t *mddev)
4252 {
4253         struct gendisk *disk = mddev->gendisk;
4254
4255         /* Complain if it has no devices */
4256         if (list_empty(&mddev->disks))
4257                 return -ENXIO;
4258         if (!mddev->pers)
4259                 return -EINVAL;
4260         if (!mddev->ro)
4261                 return -EBUSY;
4262         mddev->safemode = 0;
4263         mddev->ro = 0;
4264         set_disk_ro(disk, 0);
4265         printk(KERN_INFO "md: %s switched to read-write mode.\n",
4266                 mdname(mddev));
4267         /* Kick recovery or resync if necessary */
4268         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4269         md_wakeup_thread(mddev->thread);
4270         md_wakeup_thread(mddev->sync_thread);
4271         sysfs_notify_dirent(mddev->sysfs_state);
4272         return 0;
4273 }
4274
4275 /* similar to deny_write_access, but accounts for our holding a reference
4276  * to the file ourselves */
4277 static int deny_bitmap_write_access(struct file * file)
4278 {
4279         struct inode *inode = file->f_mapping->host;
4280
4281         spin_lock(&inode->i_lock);
4282         if (atomic_read(&inode->i_writecount) > 1) {
4283                 spin_unlock(&inode->i_lock);
4284                 return -ETXTBSY;
4285         }
4286         atomic_set(&inode->i_writecount, -1);
4287         spin_unlock(&inode->i_lock);
4288
4289         return 0;
4290 }
4291
4292 static void restore_bitmap_write_access(struct file *file)
4293 {
4294         struct inode *inode = file->f_mapping->host;
4295
4296         spin_lock(&inode->i_lock);
4297         atomic_set(&inode->i_writecount, 1);
4298         spin_unlock(&inode->i_lock);
4299 }
4300
4301 /* mode:
4302  *   0 - completely stop and disassemble array
4303  *   1 - switch to readonly
4304  *   2 - stop but do not disassemble array
4305  */
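/* For reference, the callers visible below: the STOP_ARRAY ioctl calls
 * do_md_stop() with mode 0 and STOP_ARRAY_RO with mode 1 (both with
 * is_open == 1), while autorun_array() uses mode 0 with is_open == 0
 * when tearing down an array that failed to start.
 */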
4306 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4307 {
4308         int err = 0;
4309         struct gendisk *disk = mddev->gendisk;
4310         mdk_rdev_t *rdev;
4311
4312         mutex_lock(&mddev->open_mutex);
4313         if (atomic_read(&mddev->openers) > is_open) {
4314                 printk("md: %s still in use.\n",mdname(mddev));
4315                 err = -EBUSY;
4316         } else if (mddev->pers) {
4317
4318                 if (mddev->sync_thread) {
4319                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4320                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4321                         md_unregister_thread(mddev->sync_thread);
4322                         mddev->sync_thread = NULL;
4323                 }
4324
4325                 del_timer_sync(&mddev->safemode_timer);
4326
4327                 switch(mode) {
4328                 case 1: /* readonly */
4329                         err  = -ENXIO;
4330                         if (mddev->ro==1)
4331                                 goto out;
4332                         mddev->ro = 1;
4333                         break;
4334                 case 0: /* disassemble */
4335                 case 2: /* stop */
4336                         bitmap_flush(mddev);
4337                         md_super_wait(mddev);
4338                         if (mddev->ro)
4339                                 set_disk_ro(disk, 0);
4340
4341                         mddev->pers->stop(mddev);
4342                         mddev->queue->merge_bvec_fn = NULL;
4343                         mddev->queue->unplug_fn = NULL;
4344                         mddev->queue->backing_dev_info.congested_fn = NULL;
4345                         module_put(mddev->pers->owner);
4346                         if (mddev->pers->sync_request)
4347                                 mddev->private = &md_redundancy_group;
4348                         mddev->pers = NULL;
4349                         /* tell userspace to handle 'inactive' */
4350                         sysfs_notify_dirent(mddev->sysfs_state);
4351
4352                         list_for_each_entry(rdev, &mddev->disks, same_set)
4353                                 if (rdev->raid_disk >= 0) {
4354                                         char nm[20];
4355                                         sprintf(nm, "rd%d", rdev->raid_disk);
4356                                         sysfs_remove_link(&mddev->kobj, nm);
4357                                 }
4358
4359                         set_capacity(disk, 0);
4360                         mddev->changed = 1;
4361
4362                         if (mddev->ro)
4363                                 mddev->ro = 0;
4364                 }
4365                 if (!mddev->in_sync || mddev->flags) {
4366                         /* mark array as shutdown cleanly */
4367                         mddev->in_sync = 1;
4368                         md_update_sb(mddev, 1);
4369                 }
4370                 if (mode == 1)
4371                         set_disk_ro(disk, 1);
4372                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4373                 err = 0;
4374         }
4375 out:
4376         mutex_unlock(&mddev->open_mutex);
4377         if (err)
4378                 return err;
4379         /*
4380          * Free resources if final stop
4381          */
4382         if (mode == 0) {
4383
4384                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4385
4386                 bitmap_destroy(mddev);
4387                 if (mddev->bitmap_file) {
4388                         restore_bitmap_write_access(mddev->bitmap_file);
4389                         fput(mddev->bitmap_file);
4390                         mddev->bitmap_file = NULL;
4391                 }
4392                 mddev->bitmap_offset = 0;
4393
4394                 /* make sure all md_delayed_delete calls have finished */
4395                 flush_scheduled_work();
4396
4397                 export_array(mddev);
4398
4399                 mddev->array_sectors = 0;
4400                 mddev->external_size = 0;
4401                 mddev->dev_sectors = 0;
4402                 mddev->raid_disks = 0;
4403                 mddev->recovery_cp = 0;
4404                 mddev->resync_min = 0;
4405                 mddev->resync_max = MaxSector;
4406                 mddev->reshape_position = MaxSector;
4407                 mddev->external = 0;
4408                 mddev->persistent = 0;
4409                 mddev->level = LEVEL_NONE;
4410                 mddev->clevel[0] = 0;
4411                 mddev->flags = 0;
4412                 mddev->ro = 0;
4413                 mddev->metadata_type[0] = 0;
4414                 mddev->chunk_sectors = 0;
4415                 mddev->ctime = mddev->utime = 0;
4416                 mddev->layout = 0;
4417                 mddev->max_disks = 0;
4418                 mddev->events = 0;
4419                 mddev->delta_disks = 0;
4420                 mddev->new_level = LEVEL_NONE;
4421                 mddev->new_layout = 0;
4422                 mddev->new_chunk_sectors = 0;
4423                 mddev->curr_resync = 0;
4424                 mddev->resync_mismatches = 0;
4425                 mddev->suspend_lo = mddev->suspend_hi = 0;
4426                 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4427                 mddev->recovery = 0;
4428                 mddev->in_sync = 0;
4429                 mddev->changed = 0;
4430                 mddev->degraded = 0;
4431                 mddev->barriers_work = 0;
4432                 mddev->safemode = 0;
4433                 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4434                 if (mddev->hold_active == UNTIL_STOP)
4435                         mddev->hold_active = 0;
4436
4437         } else if (mddev->pers)
4438                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4439                         mdname(mddev));
4440         err = 0;
4441         blk_integrity_unregister(disk);
4442         md_new_event(mddev);
4443         sysfs_notify_dirent(mddev->sysfs_state);
4444         return err;
4445 }
4446
4447 #ifndef MODULE
4448 static void autorun_array(mddev_t *mddev)
4449 {
4450         mdk_rdev_t *rdev;
4451         int err;
4452
4453         if (list_empty(&mddev->disks))
4454                 return;
4455
4456         printk(KERN_INFO "md: running: ");
4457
4458         list_for_each_entry(rdev, &mddev->disks, same_set) {
4459                 char b[BDEVNAME_SIZE];
4460                 printk("<%s>", bdevname(rdev->bdev,b));
4461         }
4462         printk("\n");
4463
4464         err = do_md_run(mddev);
4465         if (err) {
4466                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4467                 do_md_stop(mddev, 0, 0);
4468         }
4469 }
4470
/*
 * Let's try to run arrays based on all disks that have arrived
 * until now (those are in pending_raid_disks).
 *
 * The method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count.
 */
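/* For one UUID group the printks below produce a trace roughly like
 * this (device and array names are illustrative only):
 *
 *   md: autorun ...
 *   md: considering sdb1 ...
 *   md:  adding sdb1 ...
 *   md:  adding sdc1 ...
 *   md: created md0
 *   md: running: <sdc1><sdb1>
 *   md: ... autorun DONE.
 */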
4483 static void autorun_devices(int part)
4484 {
4485         mdk_rdev_t *rdev0, *rdev, *tmp;
4486         mddev_t *mddev;
4487         char b[BDEVNAME_SIZE];
4488
4489         printk(KERN_INFO "md: autorun ...\n");
4490         while (!list_empty(&pending_raid_disks)) {
4491                 int unit;
4492                 dev_t dev;
4493                 LIST_HEAD(candidates);
4494                 rdev0 = list_entry(pending_raid_disks.next,
4495                                          mdk_rdev_t, same_set);
4496
4497                 printk(KERN_INFO "md: considering %s ...\n",
4498                         bdevname(rdev0->bdev,b));
4500                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4501                         if (super_90_load(rdev, rdev0, 0) >= 0) {
4502                                 printk(KERN_INFO "md:  adding %s ...\n",
4503                                         bdevname(rdev->bdev,b));
4504                                 list_move(&rdev->same_set, &candidates);
4505                         }
4506                 /*
4507                  * now we have a set of devices, with all of them having
4508                  * mostly sane superblocks. It's time to allocate the
4509                  * mddev.
4510                  */
4511                 if (part) {
4512                         dev = MKDEV(mdp_major,
4513                                     rdev0->preferred_minor << MdpMinorShift);
4514                         unit = MINOR(dev) >> MdpMinorShift;
4515                 } else {
4516                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4517                         unit = MINOR(dev);
4518                 }
4519                 if (rdev0->preferred_minor != unit) {
4520                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4521                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4522                         break;
4523                 }
4524
4525                 md_probe(dev, NULL, NULL);
4526                 mddev = mddev_find(dev);
4527                 if (!mddev || !mddev->gendisk) {
4528                         if (mddev)
4529                                 mddev_put(mddev);
4530                         printk(KERN_ERR
4531                                 "md: cannot allocate memory for md drive.\n");
4532                         break;
4533                 }
4534                 if (mddev_lock(mddev)) 
4535                         printk(KERN_WARNING "md: %s locked, cannot run\n",
4536                                mdname(mddev));
4537                 else if (mddev->raid_disks || mddev->major_version
4538                          || !list_empty(&mddev->disks)) {
4539                         printk(KERN_WARNING 
4540                                 "md: %s already running, cannot run %s\n",
4541                                 mdname(mddev), bdevname(rdev0->bdev,b));
4542                         mddev_unlock(mddev);
4543                 } else {
4544                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
4545                         mddev->persistent = 1;
4546                         rdev_for_each_list(rdev, tmp, &candidates) {
4547                                 list_del_init(&rdev->same_set);
4548                                 if (bind_rdev_to_array(rdev, mddev))
4549                                         export_rdev(rdev);
4550                         }
4551                         autorun_array(mddev);
4552                         mddev_unlock(mddev);
4553                 }
                /* on success, candidates will be empty; on error
                 * it won't be...
                 */
4557                 rdev_for_each_list(rdev, tmp, &candidates) {
4558                         list_del_init(&rdev->same_set);
4559                         export_rdev(rdev);
4560                 }
4561                 mddev_put(mddev);
4562         }
4563         printk(KERN_INFO "md: ... autorun DONE.\n");
4564 }
4565 #endif /* !MODULE */
4566
4567 static int get_version(void __user * arg)
4568 {
4569         mdu_version_t ver;
4570
4571         ver.major = MD_MAJOR_VERSION;
4572         ver.minor = MD_MINOR_VERSION;
4573         ver.patchlevel = MD_PATCHLEVEL_VERSION;
4574
4575         if (copy_to_user(arg, &ver, sizeof(ver)))
4576                 return -EFAULT;
4577
4578         return 0;
4579 }
4580
4581 static int get_array_info(mddev_t * mddev, void __user * arg)
4582 {
4583         mdu_array_info_t info;
4584         int nr,working,insync,failed,spare;
4585         mdk_rdev_t *rdev;
4586
4587         nr=working=insync=failed=spare=0;
4588         list_for_each_entry(rdev, &mddev->disks, same_set) {
4589                 nr++;
4590                 if (test_bit(Faulty, &rdev->flags))
4591                         failed++;
4592                 else {
4593                         working++;
4594                         if (test_bit(In_sync, &rdev->flags))
4595                                 insync++;       
4596                         else
4597                                 spare++;
4598                 }
4599         }
4600
4601         info.major_version = mddev->major_version;
4602         info.minor_version = mddev->minor_version;
4603         info.patch_version = MD_PATCHLEVEL_VERSION;
4604         info.ctime         = mddev->ctime;
4605         info.level         = mddev->level;
4606         info.size          = mddev->dev_sectors / 2;
4607         if (info.size != mddev->dev_sectors / 2) /* overflow */
4608                 info.size = -1;
4609         info.nr_disks      = nr;
4610         info.raid_disks    = mddev->raid_disks;
4611         info.md_minor      = mddev->md_minor;
4612         info.not_persistent= !mddev->persistent;
4613
4614         info.utime         = mddev->utime;
4615         info.state         = 0;
4616         if (mddev->in_sync)
4617                 info.state = (1<<MD_SB_CLEAN);
4618         if (mddev->bitmap && mddev->bitmap_offset)
                info.state |= (1<<MD_SB_BITMAP_PRESENT);
4620         info.active_disks  = insync;
4621         info.working_disks = working;
4622         info.failed_disks  = failed;
4623         info.spare_disks   = spare;
4624
4625         info.layout        = mddev->layout;
4626         info.chunk_size    = mddev->chunk_sectors << 9;
4627
4628         if (copy_to_user(arg, &info, sizeof(info)))
4629                 return -EFAULT;
4630
4631         return 0;
4632 }
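/* A minimal userspace sketch of this interface (illustrative only, not
 * part of the driver; assumes <sys/ioctl.h> and <linux/raid/md_u.h> for
 * GET_ARRAY_INFO and mdu_array_info_t):
 *
 *   mdu_array_info_t info;
 *   int fd = open("/dev/md0", O_RDONLY);
 *   if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *           printf("level %d: %d/%d disks in sync\n", info.level,
 *                  info.active_disks, info.raid_disks);
 */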
4633
4634 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4635 {
4636         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4637         char *ptr, *buf = NULL;
4638         int err = -ENOMEM;
4639
4640         if (md_allow_write(mddev))
4641                 file = kmalloc(sizeof(*file), GFP_NOIO);
4642         else
4643                 file = kmalloc(sizeof(*file), GFP_KERNEL);
4644
4645         if (!file)
4646                 goto out;
4647
4648         /* bitmap disabled, zero the first byte and copy out */
4649         if (!mddev->bitmap || !mddev->bitmap->file) {
4650                 file->pathname[0] = '\0';
4651                 goto copy_out;
4652         }
4653
4654         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4655         if (!buf)
4656                 goto out;
4657
4658         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4659         if (IS_ERR(ptr))
4660                 goto out;
4661
4662         strcpy(file->pathname, ptr);
4663
4664 copy_out:
4665         err = 0;
4666         if (copy_to_user(arg, file, sizeof(*file)))
4667                 err = -EFAULT;
4668 out:
4669         kfree(buf);
4670         kfree(file);
4671         return err;
4672 }
4673
4674 static int get_disk_info(mddev_t * mddev, void __user * arg)
4675 {
4676         mdu_disk_info_t info;
4677         mdk_rdev_t *rdev;
4678
4679         if (copy_from_user(&info, arg, sizeof(info)))
4680                 return -EFAULT;
4681
4682         rdev = find_rdev_nr(mddev, info.number);
4683         if (rdev) {
4684                 info.major = MAJOR(rdev->bdev->bd_dev);
4685                 info.minor = MINOR(rdev->bdev->bd_dev);
4686                 info.raid_disk = rdev->raid_disk;
4687                 info.state = 0;
4688                 if (test_bit(Faulty, &rdev->flags))
4689                         info.state |= (1<<MD_DISK_FAULTY);
4690                 else if (test_bit(In_sync, &rdev->flags)) {
4691                         info.state |= (1<<MD_DISK_ACTIVE);
4692                         info.state |= (1<<MD_DISK_SYNC);
4693                 }
4694                 if (test_bit(WriteMostly, &rdev->flags))
4695                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
4696         } else {
4697                 info.major = info.minor = 0;
4698                 info.raid_disk = -1;
4699                 info.state = (1<<MD_DISK_REMOVED);
4700         }
4701
4702         if (copy_to_user(arg, &info, sizeof(info)))
4703                 return -EFAULT;
4704
4705         return 0;
4706 }
4707
4708 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4709 {
4710         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4711         mdk_rdev_t *rdev;
4712         dev_t dev = MKDEV(info->major,info->minor);
4713
4714         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4715                 return -EOVERFLOW;
4716
4717         if (!mddev->raid_disks) {
4718                 int err;
4719                 /* expecting a device which has a superblock */
4720                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4721                 if (IS_ERR(rdev)) {
4722                         printk(KERN_WARNING 
4723                                 "md: md_import_device returned %ld\n",
4724                                 PTR_ERR(rdev));
4725                         return PTR_ERR(rdev);
4726                 }
4727                 if (!list_empty(&mddev->disks)) {
4728                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4729                                                         mdk_rdev_t, same_set);
4730                         err = super_types[mddev->major_version]
4731                                 .load_super(rdev, rdev0, mddev->minor_version);
4732                         if (err < 0) {
4733                                 printk(KERN_WARNING 
4734                                         "md: %s has different UUID to %s\n",
4735                                         bdevname(rdev->bdev,b), 
4736                                         bdevname(rdev0->bdev,b2));
4737                                 export_rdev(rdev);
4738                                 return -EINVAL;
4739                         }
4740                 }
4741                 err = bind_rdev_to_array(rdev, mddev);
4742                 if (err)
4743                         export_rdev(rdev);
4744                 return err;
4745         }
4746
4747         /*
4748          * add_new_disk can be used once the array is assembled
4749          * to add "hot spares".  They must already have a superblock
4750          * written
4751          */
4752         if (mddev->pers) {
4753                 int err;
4754                 if (!mddev->pers->hot_add_disk) {
4755                         printk(KERN_WARNING 
4756                                 "%s: personality does not support diskops!\n",
4757                                mdname(mddev));
4758                         return -EINVAL;
4759                 }
4760                 if (mddev->persistent)
4761                         rdev = md_import_device(dev, mddev->major_version,
4762                                                 mddev->minor_version);
4763                 else
4764                         rdev = md_import_device(dev, -1, -1);
4765                 if (IS_ERR(rdev)) {
4766                         printk(KERN_WARNING 
4767                                 "md: md_import_device returned %ld\n",
4768                                 PTR_ERR(rdev));
4769                         return PTR_ERR(rdev);
4770                 }
4771                 /* set save_raid_disk if appropriate */
4772                 if (!mddev->persistent) {
4773                         if (info->state & (1<<MD_DISK_SYNC)  &&
4774                             info->raid_disk < mddev->raid_disks)
4775                                 rdev->raid_disk = info->raid_disk;
4776                         else
4777                                 rdev->raid_disk = -1;
4778                 } else
4779                         super_types[mddev->major_version].
4780                                 validate_super(mddev, rdev);
4781                 rdev->saved_raid_disk = rdev->raid_disk;
4782
4783                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4784                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4785                         set_bit(WriteMostly, &rdev->flags);
4786                 else
4787                         clear_bit(WriteMostly, &rdev->flags);
4788
4789                 rdev->raid_disk = -1;
4790                 err = bind_rdev_to_array(rdev, mddev);
4791                 if (!err && !mddev->pers->hot_remove_disk) {
                        /* If there is hot_add_disk but no hot_remove_disk
                         * then added disks are for geometry changes,
                         * and should be added immediately.
                         */
4796                         super_types[mddev->major_version].
4797                                 validate_super(mddev, rdev);
4798                         err = mddev->pers->hot_add_disk(mddev, rdev);
4799                         if (err)
4800                                 unbind_rdev_from_array(rdev);
4801                 }
4802                 if (err)
4803                         export_rdev(rdev);
4804                 else
4805                         sysfs_notify_dirent(rdev->sysfs_state);
4806
4807                 md_update_sb(mddev, 1);
4808                 if (mddev->degraded)
4809                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4810                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4811                 md_wakeup_thread(mddev->thread);
4812                 return err;
4813         }
4814
4815         /* otherwise, add_new_disk is only allowed
4816          * for major_version==0 superblocks
4817          */
4818         if (mddev->major_version != 0) {
4819                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4820                        mdname(mddev));
4821                 return -EINVAL;
4822         }
4823
4824         if (!(info->state & (1<<MD_DISK_FAULTY))) {
4825                 int err;
4826                 rdev = md_import_device(dev, -1, 0);
4827                 if (IS_ERR(rdev)) {
4828                         printk(KERN_WARNING 
4829                                 "md: error, md_import_device() returned %ld\n",
4830                                 PTR_ERR(rdev));
4831                         return PTR_ERR(rdev);
4832                 }
4833                 rdev->desc_nr = info->number;
4834                 if (info->raid_disk < mddev->raid_disks)
4835                         rdev->raid_disk = info->raid_disk;
4836                 else
4837                         rdev->raid_disk = -1;
4838
4839                 if (rdev->raid_disk < mddev->raid_disks)
4840                         if (info->state & (1<<MD_DISK_SYNC))
4841                                 set_bit(In_sync, &rdev->flags);
4842
4843                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4844                         set_bit(WriteMostly, &rdev->flags);
4845
4846                 if (!mddev->persistent) {
4847                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
4848                         rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4849                 } else 
4850                         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4851                 rdev->sectors = rdev->sb_start;
4852
4853                 err = bind_rdev_to_array(rdev, mddev);
4854                 if (err) {
4855                         export_rdev(rdev);
4856                         return err;
4857                 }
4858         }
4859
4860         return 0;
4861 }
4862
4863 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4864 {
4865         char b[BDEVNAME_SIZE];
4866         mdk_rdev_t *rdev;
4867
4868         rdev = find_rdev(mddev, dev);
4869         if (!rdev)
4870                 return -ENXIO;
4871
4872         if (rdev->raid_disk >= 0)
4873                 goto busy;
4874
4875         kick_rdev_from_array(rdev);
4876         md_update_sb(mddev, 1);
4877         md_new_event(mddev);
4878
4879         return 0;
4880 busy:
4881         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4882                 bdevname(rdev->bdev,b), mdname(mddev));
4883         return -EBUSY;
4884 }
4885
4886 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4887 {
4888         char b[BDEVNAME_SIZE];
4889         int err;
4890         mdk_rdev_t *rdev;
4891
4892         if (!mddev->pers)
4893                 return -ENODEV;
4894
4895         if (mddev->major_version != 0) {
4896                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4897                         " version-0 superblocks.\n",
4898                         mdname(mddev));
4899                 return -EINVAL;
4900         }
4901         if (!mddev->pers->hot_add_disk) {
4902                 printk(KERN_WARNING 
4903                         "%s: personality does not support diskops!\n",
4904                         mdname(mddev));
4905                 return -EINVAL;
4906         }
4907
4908         rdev = md_import_device(dev, -1, 0);
4909         if (IS_ERR(rdev)) {
4910                 printk(KERN_WARNING 
4911                         "md: error, md_import_device() returned %ld\n",
4912                         PTR_ERR(rdev));
4913                 return -EINVAL;
4914         }
4915
4916         if (mddev->persistent)
4917                 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4918         else
4919                 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4920
4921         rdev->sectors = rdev->sb_start;
4922
4923         if (test_bit(Faulty, &rdev->flags)) {
4924                 printk(KERN_WARNING 
4925                         "md: can not hot-add faulty %s disk to %s!\n",
4926                         bdevname(rdev->bdev,b), mdname(mddev));
4927                 err = -EINVAL;
4928                 goto abort_export;
4929         }
4930         clear_bit(In_sync, &rdev->flags);
4931         rdev->desc_nr = -1;
4932         rdev->saved_raid_disk = -1;
4933         err = bind_rdev_to_array(rdev, mddev);
4934         if (err)
4935                 goto abort_export;
4936
        /*
         * The rest had better be atomic; we can have disk failures
         * noticed in interrupt context ...
         */
4941
4942         rdev->raid_disk = -1;
4943
4944         md_update_sb(mddev, 1);
4945
4946         /*
4947          * Kick recovery, maybe this spare has to be added to the
4948          * array immediately.
4949          */
4950         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4951         md_wakeup_thread(mddev->thread);
4952         md_new_event(mddev);
4953         return 0;
4954
4955 abort_export:
4956         export_rdev(rdev);
4957         return err;
4958 }
4959
4960 static int set_bitmap_file(mddev_t *mddev, int fd)
4961 {
4962         int err;
4963
4964         if (mddev->pers) {
4965                 if (!mddev->pers->quiesce)
4966                         return -EBUSY;
4967                 if (mddev->recovery || mddev->sync_thread)
4968                         return -EBUSY;
4969                 /* we should be able to change the bitmap.. */
4970         }
4971
4972
4973         if (fd >= 0) {
4974                 if (mddev->bitmap)
4975                         return -EEXIST; /* cannot add when bitmap is present */
4976                 mddev->bitmap_file = fget(fd);
4977
4978                 if (mddev->bitmap_file == NULL) {
4979                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4980                                mdname(mddev));
4981                         return -EBADF;
4982                 }
4983
4984                 err = deny_bitmap_write_access(mddev->bitmap_file);
4985                 if (err) {
4986                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4987                                mdname(mddev));
4988                         fput(mddev->bitmap_file);
4989                         mddev->bitmap_file = NULL;
4990                         return err;
4991                 }
4992                 mddev->bitmap_offset = 0; /* file overrides offset */
4993         } else if (mddev->bitmap == NULL)
4994                 return -ENOENT; /* cannot remove what isn't there */
4995         err = 0;
4996         if (mddev->pers) {
4997                 mddev->pers->quiesce(mddev, 1);
4998                 if (fd >= 0)
4999                         err = bitmap_create(mddev);
5000                 if (fd < 0 || err) {
5001                         bitmap_destroy(mddev);
5002                         fd = -1; /* make sure to put the file */
5003                 }
5004                 mddev->pers->quiesce(mddev, 0);
5005         }
5006         if (fd < 0) {
5007                 if (mddev->bitmap_file) {
5008                         restore_bitmap_write_access(mddev->bitmap_file);
5009                         fput(mddev->bitmap_file);
5010                 }
5011                 mddev->bitmap_file = NULL;
5012         }
5013
5014         return err;
5015 }
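/* Sketch of the corresponding userspace calls (illustrative only):
 * passing an open, writable file descriptor attaches a file-backed
 * bitmap, and passing -1 detaches it again:
 *
 *   ioctl(md_fd, SET_BITMAP_FILE, bitmap_fd);
 *   ioctl(md_fd, SET_BITMAP_FILE, -1);
 */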
5016
/*
 * set_array_info is used two different ways.
 * The original usage is when creating a new array.
 *  In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout and chunk_size determines the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
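/* Illustrative values for the two usages (hypothetical numbers):
 *
 *   create:    info.raid_disks = 4; info.level = 5;
 *              info.size = 1048576; info.chunk_size = 65536;
 *   assemble:  info.raid_disks = 0; info.major_version = 1;
 *              info.minor_version = 2; info.patch_version = 0;
 */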
5030 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5031 {
5032
5033         if (info->raid_disks == 0) {
5034                 /* just setting version number for superblock loading */
5035                 if (info->major_version < 0 ||
5036                     info->major_version >= ARRAY_SIZE(super_types) ||
5037                     super_types[info->major_version].name == NULL) {
5038                         /* maybe try to auto-load a module? */
5039                         printk(KERN_INFO 
5040                                 "md: superblock version %d not known\n",
5041                                 info->major_version);
5042                         return -EINVAL;
5043                 }
5044                 mddev->major_version = info->major_version;
5045                 mddev->minor_version = info->minor_version;
5046                 mddev->patch_version = info->patch_version;
5047                 mddev->persistent = !info->not_persistent;
5048                 return 0;
5049         }
5050         mddev->major_version = MD_MAJOR_VERSION;
5051         mddev->minor_version = MD_MINOR_VERSION;
5052         mddev->patch_version = MD_PATCHLEVEL_VERSION;
5053         mddev->ctime         = get_seconds();
5054
5055         mddev->level         = info->level;
5056         mddev->clevel[0]     = 0;
5057         mddev->dev_sectors   = 2 * (sector_t)info->size;
5058         mddev->raid_disks    = info->raid_disks;
        /* don't set md_minor, it is determined by which /dev/md* was
         * opened
         */
5062         if (info->state & (1<<MD_SB_CLEAN))
5063                 mddev->recovery_cp = MaxSector;
5064         else
5065                 mddev->recovery_cp = 0;
5066         mddev->persistent    = ! info->not_persistent;
5067         mddev->external      = 0;
5068
5069         mddev->layout        = info->layout;
5070         mddev->chunk_sectors = info->chunk_size >> 9;
5071
5072         mddev->max_disks     = MD_SB_DISKS;
5073
5074         if (mddev->persistent)
5075                 mddev->flags         = 0;
5076         set_bit(MD_CHANGE_DEVS, &mddev->flags);
5077
5078         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
5079         mddev->bitmap_offset = 0;
5080
5081         mddev->reshape_position = MaxSector;
5082
5083         /*
5084          * Generate a 128 bit UUID
5085          */
5086         get_random_bytes(mddev->uuid, 16);
5087
5088         mddev->new_level = mddev->level;
5089         mddev->new_chunk_sectors = mddev->chunk_sectors;
5090         mddev->new_layout = mddev->layout;
5091         mddev->delta_disks = 0;
5092
5093         return 0;
5094 }
5095
5096 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5097 {
5098         WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5099
5100         if (mddev->external_size)
5101                 return;
5102
5103         mddev->array_sectors = array_sectors;
5104 }
5105 EXPORT_SYMBOL(md_set_array_sectors);
5106
5107 static int update_size(mddev_t *mddev, sector_t num_sectors)
5108 {
5109         mdk_rdev_t *rdev;
5110         int rv;
5111         int fit = (num_sectors == 0);
5112
5113         if (mddev->pers->resize == NULL)
5114                 return -EINVAL;
        /* The "num_sectors" is the number of sectors of each device that
         * is used.  This can only make sense for arrays with redundancy;
         * linear and raid0 always use whatever space is available.  We can
         * only consider changing this number if no resync or reconstruction
         * is happening, and if the new size is acceptable.  It must fit
         * before the sb_start or, if that is < data_offset, it must fit
         * before the size of each device.  If num_sectors is zero, we find
         * the largest size that fits.
         */
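        /* Worked example (hypothetical sizes): with num_sectors == 0,
         * fit is set, so for two rdevs offering 1000 and 800 usable
         * sectors the loop below first sets num_sectors to 1000 and
         * then lowers it to 800, i.e. the largest size every member
         * can hold.
         */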
5125         if (mddev->sync_thread)
5126                 return -EBUSY;
5127         if (mddev->bitmap)
5128                 /* Sorry, cannot grow a bitmap yet, just remove it,
5129                  * grow, and re-add.
5130                  */
5131                 return -EBUSY;
5132         list_for_each_entry(rdev, &mddev->disks, same_set) {
5133                 sector_t avail = rdev->sectors;
5134
5135                 if (fit && (num_sectors == 0 || num_sectors > avail))
5136                         num_sectors = avail;
5137                 if (avail < num_sectors)
5138                         return -ENOSPC;
5139         }
5140         rv = mddev->pers->resize(mddev, num_sectors);
5141         if (!rv)
5142                 revalidate_disk(mddev->gendisk);
5143         return rv;
5144 }
5145
5146 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5147 {
5148         int rv;
5149         /* change the number of raid disks */
5150         if (mddev->pers->check_reshape == NULL)
5151                 return -EINVAL;
5152         if (raid_disks <= 0 ||
5153             raid_disks >= mddev->max_disks)
5154                 return -EINVAL;
5155         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5156                 return -EBUSY;
5157         mddev->delta_disks = raid_disks - mddev->raid_disks;
5158
5159         rv = mddev->pers->check_reshape(mddev);
5160         return rv;
5161 }
5162
5163
5164 /*
5165  * update_array_info is used to change the configuration of an
5166  * on-line array.
5167  * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
5168  * fields in the info are checked against the array.
5169  * Any differences that cannot be handled will cause an error.
5170  * Normally, only one change can be managed at a time.
5171  */
5172 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5173 {
5174         int rv = 0;
5175         int cnt = 0;
5176         int state = 0;
5177
        /* calculate expected state, ignoring low bits */
5179         if (mddev->bitmap && mddev->bitmap_offset)
5180                 state |= (1 << MD_SB_BITMAP_PRESENT);
5181
5182         if (mddev->major_version != info->major_version ||
5183             mddev->minor_version != info->minor_version ||
5184 /*          mddev->patch_version != info->patch_version || */
5185             mddev->ctime         != info->ctime         ||
5186             mddev->level         != info->level         ||
5187 /*          mddev->layout        != info->layout        || */
5188             !mddev->persistent   != info->not_persistent||
5189             mddev->chunk_sectors != info->chunk_size >> 9 ||
5190             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5191             ((state^info->state) & 0xfffffe00)
5192                 )
5193                 return -EINVAL;
5194         /* Check there is only one change */
5195         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5196                 cnt++;
5197         if (mddev->raid_disks != info->raid_disks)
5198                 cnt++;
5199         if (mddev->layout != info->layout)
5200                 cnt++;
5201         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5202                 cnt++;
5203         if (cnt == 0)
5204                 return 0;
5205         if (cnt > 1)
5206                 return -EINVAL;
5207
5208         if (mddev->layout != info->layout) {
5209                 /* Change layout
5210                  * we don't need to do anything at the md level, the
5211                  * personality will take care of it all.
5212                  */
5213                 if (mddev->pers->check_reshape == NULL)
5214                         return -EINVAL;
5215                 else {
5216                         mddev->new_layout = info->layout;
5217                         rv = mddev->pers->check_reshape(mddev);
5218                         if (rv)
5219                                 mddev->new_layout = mddev->layout;
5220                         return rv;
5221                 }
5222         }
5223         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5224                 rv = update_size(mddev, (sector_t)info->size * 2);
5225
5226         if (mddev->raid_disks    != info->raid_disks)
5227                 rv = update_raid_disks(mddev, info->raid_disks);
5228
5229         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5230                 if (mddev->pers->quiesce == NULL)
5231                         return -EINVAL;
5232                 if (mddev->recovery || mddev->sync_thread)
5233                         return -EBUSY;
5234                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5235                         /* add the bitmap */
5236                         if (mddev->bitmap)
5237                                 return -EEXIST;
5238                         if (mddev->default_bitmap_offset == 0)
5239                                 return -EINVAL;
5240                         mddev->bitmap_offset = mddev->default_bitmap_offset;
5241                         mddev->pers->quiesce(mddev, 1);
5242                         rv = bitmap_create(mddev);
5243                         if (rv)
5244                                 bitmap_destroy(mddev);
5245                         mddev->pers->quiesce(mddev, 0);
5246                 } else {
5247                         /* remove the bitmap */
5248                         if (!mddev->bitmap)
5249                                 return -ENOENT;
5250                         if (mddev->bitmap->file)
5251                                 return -EINVAL;
5252                         mddev->pers->quiesce(mddev, 1);
5253                         bitmap_destroy(mddev);
5254                         mddev->pers->quiesce(mddev, 0);
5255                         mddev->bitmap_offset = 0;
5256                 }
5257         }
5258         md_update_sb(mddev, 1);
5259         return rv;
5260 }
5261
5262 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5263 {
5264         mdk_rdev_t *rdev;
5265
5266         if (mddev->pers == NULL)
5267                 return -ENODEV;
5268
5269         rdev = find_rdev(mddev, dev);
5270         if (!rdev)
5271                 return -ENODEV;
5272
5273         md_error(mddev, rdev);
5274         return 0;
5275 }
5276
/*
 * We have a problem here: there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2-head,
 * 4-sector geometry (with a BIG number of cylinders...). This
 * drives dosfs just mad... ;-)
 */
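/* Worked example: a 1 GiB array has 2097152 512-byte sectors, so the
 * code below reports 2 heads * 4 sectors * 262144 cylinders
 * (2097152 / 8); heads * sectors * cylinders preserves the capacity.
 */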
5283 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5284 {
5285         mddev_t *mddev = bdev->bd_disk->private_data;
5286
5287         geo->heads = 2;
5288         geo->sectors = 4;
5289         geo->cylinders = get_capacity(mddev->gendisk) / 8;
5290         return 0;
5291 }
5292
5293 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5294                         unsigned int cmd, unsigned long arg)
5295 {
5296         int err = 0;
5297         void __user *argp = (void __user *)arg;
5298         mddev_t *mddev = NULL;
5299
5300         if (!capable(CAP_SYS_ADMIN))
5301                 return -EACCES;
5302
5303         /*
5304          * Commands dealing with the RAID driver but not any
5305          * particular array:
5306          */
5307         switch (cmd)
5308         {
5309                 case RAID_VERSION:
5310                         err = get_version(argp);
5311                         goto done;
5312
5313                 case PRINT_RAID_DEBUG:
5314                         err = 0;
5315                         md_print_devices();
5316                         goto done;
5317
5318 #ifndef MODULE
5319                 case RAID_AUTORUN:
5320                         err = 0;
5321                         autostart_arrays(arg);
5322                         goto done;
5323 #endif
5324                 default:;
5325         }
5326
5327         /*
5328          * Commands creating/starting a new array:
5329          */
5330
5331         mddev = bdev->bd_disk->private_data;
5332
5333         if (!mddev) {
5334                 BUG();
5335                 goto abort;
5336         }
5337
5338         err = mddev_lock(mddev);
5339         if (err) {
5340                 printk(KERN_INFO 
5341                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
5342                         err, cmd);
5343                 goto abort;
5344         }
5345
5346         switch (cmd)
5347         {
5348                 case SET_ARRAY_INFO:
5349                         {
5350                                 mdu_array_info_t info;
5351                                 if (!arg)
5352                                         memset(&info, 0, sizeof(info));
5353                                 else if (copy_from_user(&info, argp, sizeof(info))) {
5354                                         err = -EFAULT;
5355                                         goto abort_unlock;
5356                                 }
5357                                 if (mddev->pers) {
5358                                         err = update_array_info(mddev, &info);
5359                                         if (err) {
5360                                                 printk(KERN_WARNING "md: couldn't update"
5361                                                        " array info. %d\n", err);
5362                                                 goto abort_unlock;
5363                                         }
5364                                         goto done_unlock;
5365                                 }
5366                                 if (!list_empty(&mddev->disks)) {
5367                                         printk(KERN_WARNING
5368                                                "md: array %s already has disks!\n",
5369                                                mdname(mddev));
5370                                         err = -EBUSY;
5371                                         goto abort_unlock;
5372                                 }
5373                                 if (mddev->raid_disks) {
5374                                         printk(KERN_WARNING
5375                                                "md: array %s already initialised!\n",
5376                                                mdname(mddev));
5377                                         err = -EBUSY;
5378                                         goto abort_unlock;
5379                                 }
5380                                 err = set_array_info(mddev, &info);
5381                                 if (err) {
5382                                         printk(KERN_WARNING "md: couldn't set"
5383                                                " array info. %d\n", err);
5384                                         goto abort_unlock;
5385                                 }
5386                         }
5387                         goto done_unlock;
5388
5389                 default:;
5390         }
5391
5392         /*
5393          * Commands querying/configuring an existing array:
5394          */
5395         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5396          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5397         if ((!mddev->raid_disks && !mddev->external)
5398             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5399             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5400             && cmd != GET_BITMAP_FILE) {
5401                 err = -ENODEV;
5402                 goto abort_unlock;
5403         }
5404
5405         /*
5406          * Commands even a read-only array can execute:
5407          */
5408         switch (cmd)
5409         {
5410                 case GET_ARRAY_INFO:
5411                         err = get_array_info(mddev, argp);
5412                         goto done_unlock;
5413
5414                 case GET_BITMAP_FILE:
5415                         err = get_bitmap_file(mddev, argp);
5416                         goto done_unlock;
5417
5418                 case GET_DISK_INFO:
5419                         err = get_disk_info(mddev, argp);
5420                         goto done_unlock;
5421
5422                 case RESTART_ARRAY_RW:
5423                         err = restart_array(mddev);
5424                         goto done_unlock;
5425
5426                 case STOP_ARRAY:
5427                         err = do_md_stop(mddev, 0, 1);
5428                         goto done_unlock;
5429
5430                 case STOP_ARRAY_RO:
5431                         err = do_md_stop(mddev, 1, 1);
5432                         goto done_unlock;
5433
5434         }
5435
5436         /*
5437          * The remaining ioctls are changing the state of the
5438          * superblock, so we do not allow them on read-only arrays.
5439          * However non-MD ioctls (e.g. get-size) will still come through
5440          * here and hit the 'default' below, so only disallow
5441          * 'md' ioctls, and switch to rw mode if started auto-readonly.
5442          */
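        /* Example: on an array running auto-read-only (ro == 2), the
         * first state-changing md ioctl, e.g. ADD_NEW_DISK, switches
         * it to read-write here and kicks recovery; on an explicitly
         * read-only array (ro == 1) the same ioctl fails with -EROFS.
         */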
5443         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5444                 if (mddev->ro == 2) {
5445                         mddev->ro = 0;
5446                         sysfs_notify_dirent(mddev->sysfs_state);
5447                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5448                         md_wakeup_thread(mddev->thread);
5449                 } else {
5450                         err = -EROFS;
5451                         goto abort_unlock;
5452                 }
5453         }
5454
5455         switch (cmd)
5456         {
5457                 case ADD_NEW_DISK:
5458                 {
5459                         mdu_disk_info_t info;
5460                         if (copy_from_user(&info, argp, sizeof(info)))
5461                                 err = -EFAULT;
5462                         else
5463                                 err = add_new_disk(mddev, &info);
5464                         goto done_unlock;
5465                 }
5466
5467                 case HOT_REMOVE_DISK:
5468                         err = hot_remove_disk(mddev, new_decode_dev(arg));
5469                         goto done_unlock;
5470
5471                 case HOT_ADD_DISK:
5472                         err = hot_add_disk(mddev, new_decode_dev(arg));
5473                         goto done_unlock;
5474
5475                 case SET_DISK_FAULTY:
5476                         err = set_disk_faulty(mddev, new_decode_dev(arg));
5477                         goto done_unlock;
5478
5479                 case RUN_ARRAY:
5480                         err = do_md_run(mddev);
5481                         goto done_unlock;
5482
5483                 case SET_BITMAP_FILE:
5484                         err = set_bitmap_file(mddev, (int)arg);
5485                         goto done_unlock;
5486
5487                 default:
5488                         err = -EINVAL;
5489                         goto abort_unlock;
5490         }
5491
5492 done_unlock:
5493 abort_unlock:
5494         if (mddev->hold_active == UNTIL_IOCTL &&
5495             err != -EINVAL)
5496                 mddev->hold_active = 0;
5497         mddev_unlock(mddev);
5498
5499         return err;
5500 done:
5501         if (err)
5502                 MD_BUG();
5503 abort:
5504         return err;
5505 }
5506
5507 static int md_open(struct block_device *bdev, fmode_t mode)
5508 {
5509         /*
5510          * Succeed if we can lock the mddev, which confirms that
5511          * it isn't being stopped right now.
5512          */
5513         mddev_t *mddev = mddev_find(bdev->bd_dev);
5514         int err;
5515
5516         if (mddev->gendisk != bdev->bd_disk) {
5517                 /* we are racing with mddev_put which is discarding this
5518                  * bd_disk.
5519                  */
5520                 mddev_put(mddev);
5521                 /* Wait until bdev->bd_disk is definitely gone */
5522                 flush_scheduled_work();
5523                 /* Then retry the open from the top */
5524                 return -ERESTARTSYS;
5525         }
5526         BUG_ON(mddev != bdev->bd_disk->private_data);
5527
5528         if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5529                 goto out;
5530
5531         err = 0;
5532         atomic_inc(&mddev->openers);
5533         mutex_unlock(&mddev->open_mutex);
5534
5535         check_disk_change(bdev);
5536  out:
5537         return err;
5538 }
5539
5540 static int md_release(struct gendisk *disk, fmode_t mode)
5541 {
5542         mddev_t *mddev = disk->private_data;
5543
5544         BUG_ON(!mddev);
5545         atomic_dec(&mddev->openers);
5546         mddev_put(mddev);
5547
5548         return 0;
5549 }
5550
5551 static int md_media_changed(struct gendisk *disk)
5552 {
5553         mddev_t *mddev = disk->private_data;
5554
5555         return mddev->changed;
5556 }
5557
5558 static int md_revalidate(struct gendisk *disk)
5559 {
5560         mddev_t *mddev = disk->private_data;
5561
5562         mddev->changed = 0;
5563         return 0;
5564 }
5565 static const struct block_device_operations md_fops =
5566 {
5567         .owner          = THIS_MODULE,
5568         .open           = md_open,
5569         .release        = md_release,
5570         .ioctl          = md_ioctl,
5571         .getgeo         = md_getgeo,
5572         .media_changed  = md_media_changed,
5573         .revalidate_disk= md_revalidate,
5574 };
5575
5576 static int md_thread(void * arg)
5577 {
5578         mdk_thread_t *thread = arg;
5579
5580         /*
         * md_thread is a 'system-thread', its priority should be very
5582          * high. We avoid resource deadlocks individually in each
5583          * raid personality. (RAID5 does preallocation) We also use RR and
5584          * the very same RT priority as kswapd, thus we will never get
5585          * into a priority inversion deadlock.
5586          *
5587          * we definitely have to have equal or higher priority than
5588          * bdflush, otherwise bdflush will deadlock if there are too
5589          * many dirty RAID5 blocks.
5590          */
5591
5592         allow_signal(SIGKILL);
5593         while (!kthread_should_stop()) {
5594
5595                 /* We need to wait INTERRUPTIBLE so that
5596                  * we don't add to the load-average.
5597                  * That means we need to be sure no signals are
5598                  * pending
5599                  */
5600                 if (signal_pending(current))
5601                         flush_signals(current);
5602
5603                 wait_event_interruptible_timeout
5604                         (thread->wqueue,
5605                          test_bit(THREAD_WAKEUP, &thread->flags)
5606                          || kthread_should_stop(),
5607                          thread->timeout);
5608
5609                 clear_bit(THREAD_WAKEUP, &thread->flags);
5610
5611                 thread->run(thread->mddev);
5612         }
5613
5614         return 0;
5615 }
5616
5617 void md_wakeup_thread(mdk_thread_t *thread)
5618 {
5619         if (thread) {
5620                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5621                 set_bit(THREAD_WAKEUP, &thread->flags);
5622                 wake_up(&thread->wqueue);
5623         }
5624 }
5625
5626 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5627                                  const char *name)
5628 {
5629         mdk_thread_t *thread;
5630
5631         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5632         if (!thread)
5633                 return NULL;
5634
5635         init_waitqueue_head(&thread->wqueue);
5636
5637         thread->run = run;
5638         thread->mddev = mddev;
5639         thread->timeout = MAX_SCHEDULE_TIMEOUT;
5640         thread->tsk = kthread_run(md_thread, thread,
5641                                   "%s_%s",
5642                                   mdname(thread->mddev),
5643                                   name ?: mddev->pers->name);
5644         if (IS_ERR(thread->tsk)) {
5645                 kfree(thread);
5646                 return NULL;
5647         }
5648         return thread;
5649 }
5650
5651 void md_unregister_thread(mdk_thread_t *thread)
5652 {
5653         if (!thread)
5654                 return;
5655         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5656
5657         kthread_stop(thread->tsk);
5658         kfree(thread);
5659 }
5660
5661 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5662 {
5663         if (!mddev) {
5664                 MD_BUG();
5665                 return;
5666         }
5667
5668         if (!rdev || test_bit(Faulty, &rdev->flags))
5669                 return;
5670
5671         if (mddev->external)
5672                 set_bit(Blocked, &rdev->flags);
5673 /*
5674         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5675                 mdname(mddev),
5676                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5677                 __builtin_return_address(0),__builtin_return_address(1),
5678                 __builtin_return_address(2),__builtin_return_address(3));
5679 */
5680         if (!mddev->pers)
5681                 return;
5682         if (!mddev->pers->error_handler)
5683                 return;
5684         mddev->pers->error_handler(mddev,rdev);
5685         if (mddev->degraded)
5686                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5687         set_bit(StateChanged, &rdev->flags);
5688         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5689         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5690         md_wakeup_thread(mddev->thread);
5691         md_new_event_inintr(mddev);
5692 }
5693
5694 /* seq_file implementation /proc/mdstat */
5695
5696 static void status_unused(struct seq_file *seq)
5697 {
5698         int i = 0;
5699         mdk_rdev_t *rdev;
5700
5701         seq_printf(seq, "unused devices: ");
5702
5703         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5704                 char b[BDEVNAME_SIZE];
5705                 i++;
5706                 seq_printf(seq, "%s ",
5707                               bdevname(rdev->bdev,b));
5708         }
5709         if (!i)
5710                 seq_printf(seq, "<none>");
5711
5712         seq_printf(seq, "\n");
5713 }
5714
5715
5716 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5717 {
5718         sector_t max_sectors, resync, res;
5719         unsigned long dt, db;
5720         sector_t rt;
5721         int scale;
5722         unsigned int per_milli;
5723
5724         resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5725
5726         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5727                 max_sectors = mddev->resync_max_sectors;
5728         else
5729                 max_sectors = mddev->dev_sectors;
5730
5731         /*
5732          * Should not happen.
5733          */
5734         if (!max_sectors) {
5735                 MD_BUG();
5736                 return;
5737         }
5738         /* Pick 'scale' such that (resync>>scale)*1000 will fit
5739          * in a sector_t, and (max_sectors>>scale) will fit in a
5740          * u32, as those are the requirements for sector_div.
5741          * Thus 'scale' must be at least 10
5742          */
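             /* Worked example: a 1TiB array has max_sectors = 2^31 (512-byte
              * sectors), so with scale = 10, (max_sectors>>scale) = 2^21 fits
              * easily in a u32 and (resync>>scale)*1000 cannot overflow even
              * a 32-bit sector_t.
              */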
5743         scale = 10;
5744         if (sizeof(sector_t) > sizeof(unsigned long)) {
5745                 while ( max_sectors/2 > (1ULL<<(scale+32)))
5746                         scale++;
5747         }
5748         res = (resync>>scale)*1000;
5749         sector_div(res, (u32)((max_sectors>>scale)+1));
5750
5751         per_milli = res;
5752         {
5753                 int i, x = per_milli/50, y = 20-x;
5754                 seq_printf(seq, "[");
5755                 for (i = 0; i < x; i++)
5756                         seq_printf(seq, "=");
5757                 seq_printf(seq, ">");
5758                 for (i = 0; i < y; i++)
5759                         seq_printf(seq, ".");
5760                 seq_printf(seq, "] ");
5761         }
5762         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5763                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5764                     "reshape" :
5765                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5766                      "check" :
5767                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5768                       "resync" : "recovery"))),
5769                    per_milli/10, per_milli % 10,
5770                    (unsigned long long) resync/2,
5771                    (unsigned long long) max_sectors/2);
5772
5773         /*
5774          * dt: time from mark until now
5775          * db: blocks written from mark until now
5776          * rt: remaining time
5777          *
5778          * rt is a sector_t, so it could be 32bit or 64bit, and we
5779          * therefore divide before multiplying in case it is 32bit and
5780          * close to the limit.
5781          * We scale the divisor (db) by 32 to avoid losing precision
5782          * near the end of resync when the number of remaining sectors
5783          * is close to 'db'.
5784          * We then divide rt by 32 after multiplying by db to compensate.
5785          * The '+1' avoids division by zero if db is very small.
5786          */
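             /* Illustrative numbers (hypothetical): with 1,000,000 sectors
              * remaining, db = 20,480 sectors since the mark and dt = 10s,
              *	rt = (1000000 / (20480/32 + 1)) * 10 >> 5 = 487 seconds,
              * close to the exact 1000000 / (20480/10) = 488 seconds.
              */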
5787         dt = ((jiffies - mddev->resync_mark) / HZ);
5788         if (!dt) dt++;
5789         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5790                 - mddev->resync_mark_cnt;
5791
5792         rt = max_sectors - resync;    /* number of remaining sectors */
5793         sector_div(rt, db/32+1);
5794         rt *= dt;
5795         rt >>= 5;
5796
5797         seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
5798                    ((unsigned long)rt % 60)/6);
5799
5800         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5801 }
5802
5803 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5804 {
5805         struct list_head *tmp;
5806         loff_t l = *pos;
5807         mddev_t *mddev;
5808
5809         if (l >= 0x10000)
5810                 return NULL;
5811         if (!l--)
5812                 /* header */
5813                 return (void*)1;
5814
5815         spin_lock(&all_mddevs_lock);
5816         list_for_each(tmp,&all_mddevs)
5817                 if (!l--) {
5818                         mddev = list_entry(tmp, mddev_t, all_mddevs);
5819                         mddev_get(mddev);
5820                         spin_unlock(&all_mddevs_lock);
5821                         return mddev;
5822                 }
5823         spin_unlock(&all_mddevs_lock);
5824         if (!l--)
5825                 return (void*)2;/* tail */
5826         return NULL;
5827 }
5828
5829 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5830 {
5831         struct list_head *tmp;
5832         mddev_t *next_mddev, *mddev = v;
5833
5834         ++*pos;
5835         if (v == (void*)2)
5836                 return NULL;
5837
5838         spin_lock(&all_mddevs_lock);
5839         if (v == (void*)1)
5840                 tmp = all_mddevs.next;
5841         else
5842                 tmp = mddev->all_mddevs.next;
5843         if (tmp != &all_mddevs)
5844                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5845         else {
5846                 next_mddev = (void*)2;
5847                 *pos = 0x10000;
5848         }
5849         spin_unlock(&all_mddevs_lock);
5850
5851         if (v != (void*)1)
5852                 mddev_put(mddev);
5853         return next_mddev;
5854
5855 }
5856
5857 static void md_seq_stop(struct seq_file *seq, void *v)
5858 {
5859         mddev_t *mddev = v;
5860
5861         if (mddev && v != (void*)1 && v != (void*)2)
5862                 mddev_put(mddev);
5863 }
5864
5865 struct mdstat_info {
5866         int event;
5867 };
5868
5869 static int md_seq_show(struct seq_file *seq, void *v)
5870 {
5871         mddev_t *mddev = v;
5872         sector_t sectors;
5873         mdk_rdev_t *rdev;
5874         struct mdstat_info *mi = seq->private;
5875         struct bitmap *bitmap;
5876
5877         if (v == (void*)1) {
5878                 struct mdk_personality *pers;
5879                 seq_printf(seq, "Personalities : ");
5880                 spin_lock(&pers_lock);
5881                 list_for_each_entry(pers, &pers_list, list)
5882                         seq_printf(seq, "[%s] ", pers->name);
5883
5884                 spin_unlock(&pers_lock);
5885                 seq_printf(seq, "\n");
5886                 mi->event = atomic_read(&md_event_count);
5887                 return 0;
5888         }
5889         if (v == (void*)2) {
5890                 status_unused(seq);
5891                 return 0;
5892         }
5893
5894         if (mddev_lock(mddev) < 0)
5895                 return -EINTR;
5896
5897         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5898                 seq_printf(seq, "%s : %sactive", mdname(mddev),
5899                                                 mddev->pers ? "" : "in");
5900                 if (mddev->pers) {
5901                         if (mddev->ro==1)
5902                                 seq_printf(seq, " (read-only)");
5903                         if (mddev->ro==2)
5904                                 seq_printf(seq, " (auto-read-only)");
5905                         seq_printf(seq, " %s", mddev->pers->name);
5906                 }
5907
5908                 sectors = 0;
5909                 list_for_each_entry(rdev, &mddev->disks, same_set) {
5910                         char b[BDEVNAME_SIZE];
5911                         seq_printf(seq, " %s[%d]",
5912                                 bdevname(rdev->bdev,b), rdev->desc_nr);
5913                         if (test_bit(WriteMostly, &rdev->flags))
5914                                 seq_printf(seq, "(W)");
5915                         if (test_bit(Faulty, &rdev->flags)) {
5916                                 seq_printf(seq, "(F)");
5917                                 continue;
5918                         } else if (rdev->raid_disk < 0)
5919                                 seq_printf(seq, "(S)"); /* spare */
5920                         sectors += rdev->sectors;
5921                 }
5922
5923                 if (!list_empty(&mddev->disks)) {
5924                         if (mddev->pers)
5925                                 seq_printf(seq, "\n      %llu blocks",
5926                                            (unsigned long long)
5927                                            mddev->array_sectors / 2);
5928                         else
5929                                 seq_printf(seq, "\n      %llu blocks",
5930                                            (unsigned long long)sectors / 2);
5931                 }
5932                 if (mddev->persistent) {
5933                         if (mddev->major_version != 0 ||
5934                             mddev->minor_version != 90) {
5935                                 seq_printf(seq," super %d.%d",
5936                                            mddev->major_version,
5937                                            mddev->minor_version);
5938                         }
5939                 } else if (mddev->external)
5940                         seq_printf(seq, " super external:%s",
5941                                    mddev->metadata_type);
5942                 else
5943                         seq_printf(seq, " super non-persistent");
5944
5945                 if (mddev->pers) {
5946                         mddev->pers->status(seq, mddev);
5947                         seq_printf(seq, "\n      ");
5948                         if (mddev->pers->sync_request) {
5949                                 if (mddev->curr_resync > 2) {
5950                                         status_resync(seq, mddev);
5951                                         seq_printf(seq, "\n      ");
5952                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5953                                         seq_printf(seq, "\tresync=DELAYED\n      ");
5954                                 else if (mddev->recovery_cp < MaxSector)
5955                                         seq_printf(seq, "\tresync=PENDING\n      ");
5956                         }
5957                 } else
5958                         seq_printf(seq, "\n       ");
5959
5960                 if ((bitmap = mddev->bitmap)) {
5961                         unsigned long chunk_kb;
5962                         unsigned long flags;
5963                         spin_lock_irqsave(&bitmap->lock, flags);
5964                         chunk_kb = bitmap->chunksize >> 10;
5965                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5966                                 "%lu%s chunk",
5967                                 bitmap->pages - bitmap->missing_pages,
5968                                 bitmap->pages,
5969                                 (bitmap->pages - bitmap->missing_pages)
5970                                         << (PAGE_SHIFT - 10),
5971                                 chunk_kb ? chunk_kb : bitmap->chunksize,
5972                                 chunk_kb ? "KB" : "B");
5973                         if (bitmap->file) {
5974                                 seq_printf(seq, ", file: ");
5975                                 seq_path(seq, &bitmap->file->f_path, " \t\n");
5976                         }
5977
5978                         seq_printf(seq, "\n");
5979                         spin_unlock_irqrestore(&bitmap->lock, flags);
5980                 }
5981
5982                 seq_printf(seq, "\n");
5983         }
5984         mddev_unlock(mddev);
5985
5986         return 0;
5987 }
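
     /*
      * Illustrative /proc/mdstat output assembled by md_seq_show() and the
      * helpers above (device names, sizes and the personality status line
      * are hypothetical):
      *
      *	Personalities : [raid1]
      *	md0 : active raid1 sdb1[1] sda1[0]
      *	      1048512 blocks [2/2] [UU]
      *	      [========>............]  resync = 42.0% (440448/1048512) finish=0.4min speed=20480K/sec
      *
      *	unused devices: <none>
      */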
5988
5989 static const struct seq_operations md_seq_ops = {
5990         .start  = md_seq_start,
5991         .next   = md_seq_next,
5992         .stop   = md_seq_stop,
5993         .show   = md_seq_show,
5994 };
5995
5996 static int md_seq_open(struct inode *inode, struct file *file)
5997 {
5998         int error;
5999         struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
6000         if (mi == NULL)
6001                 return -ENOMEM;
6002
6003         error = seq_open(file, &md_seq_ops);
6004         if (error)
6005                 kfree(mi);
6006         else {
6007                 struct seq_file *p = file->private_data;
6008                 p->private = mi;
6009                 mi->event = atomic_read(&md_event_count);
6010         }
6011         return error;
6012 }
6013
6014 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6015 {
6016         struct seq_file *m = filp->private_data;
6017         struct mdstat_info *mi = m->private;
6018         int mask;
6019
6020         poll_wait(filp, &md_event_waiters, wait);
6021
6022         /* always allow read */
6023         mask = POLLIN | POLLRDNORM;
6024
6025         if (mi->event != atomic_read(&md_event_count))
6026                 mask |= POLLERR | POLLPRI;
6027         return mask;
6028 }
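
     /*
      * Userspace sketch (assumption, not part of this driver): a monitor can
      * read /proc/mdstat once and then block until the array state changes:
      *
      *	struct pollfd pfd = { .fd = mdstat_fd, .events = POLLPRI };
      *	poll(&pfd, 1, -1);
      *	(POLLERR|POLLPRI are reported once md_event_count has advanced;
      *	 re-read the file to pick up the new state.)
      */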
6029
6030 static const struct file_operations md_seq_fops = {
6031         .owner          = THIS_MODULE,
6032         .open           = md_seq_open,
6033         .read           = seq_read,
6034         .llseek         = seq_lseek,
6035         .release        = seq_release_private,
6036         .poll           = mdstat_poll,
6037 };
6038
6039 int register_md_personality(struct mdk_personality *p)
6040 {
6041         spin_lock(&pers_lock);
6042         list_add_tail(&p->list, &pers_list);
6043         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6044         spin_unlock(&pers_lock);
6045         return 0;
6046 }
6047
6048 int unregister_md_personality(struct mdk_personality *p)
6049 {
6050         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6051         spin_lock(&pers_lock);
6052         list_del_init(&p->list);
6053         spin_unlock(&pers_lock);
6054         return 0;
6055 }
6056
6057 static int is_mddev_idle(mddev_t *mddev, int init)
6058 {
6059         mdk_rdev_t * rdev;
6060         int idle;
6061         int curr_events;
6062
6063         idle = 1;
6064         rcu_read_lock();
6065         rdev_for_each_rcu(rdev, mddev) {
6066                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6067                 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6068                               (int)part_stat_read(&disk->part0, sectors[1]) -
6069                               atomic_read(&disk->sync_io);
6070                 /* sync IO will cause sync_io to increase before the disk_stats
6071                  * as sync_io is counted when a request starts, and
6072                  * disk_stats is counted when it completes.
6073                  * So resync activity will cause curr_events to be smaller than
6074                  * when there was no such activity.
6075                  * non-sync IO will cause disk_stats to increase without
6076                  * increasing sync_io so curr_events will (eventually)
6077                  * be larger than it was before.  Once it becomes
6078                  * substantially larger, the test below will cause
6079                  * the array to appear non-idle, and resync will slow
6080                  * down.
6081                  * If there is a lot of outstanding resync activity when
6082                  * we set last_event to curr_events, then all that activity
6083                  * completing might cause the array to appear non-idle
6084                  * and resync will be slowed down even though there might
6085                  * not have been non-resync activity.  This will only
6086                  * happen once though.  'last_events' will soon reflect
6087                  * the state where there is little or no outstanding
6088                  * resync requests, and further resync activity will
6089                  * always make curr_events less than last_events.
6090                  *
6091                  */
6092                 if (init || curr_events - rdev->last_events > 64) {
6093                         rdev->last_events = curr_events;
6094                         idle = 0;
6095                 }
6096         }
6097         rcu_read_unlock();
6098         return idle;
6099 }
6100
6101 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6102 {
6103         /* another "blocks" (512byte) blocks have been synced */
6104         atomic_sub(blocks, &mddev->recovery_active);
6105         wake_up(&mddev->recovery_wait);
6106         if (!ok) {
6107                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6108                 md_wakeup_thread(mddev->thread);
6109                 /* stop recovery, signal do_sync */
6110         }
6111 }
6112
6113
6114 /* md_write_start(mddev, bi)
6115  * If we need to update some array metadata (e.g. 'active' flag
6116  * in superblock) before writing, schedule a superblock update
6117  * and wait for it to complete.
6118  */
6119 void md_write_start(mddev_t *mddev, struct bio *bi)
6120 {
6121         int did_change = 0;
6122         if (bio_data_dir(bi) != WRITE)
6123                 return;
6124
6125         BUG_ON(mddev->ro == 1);
6126         if (mddev->ro == 2) {
6127                 /* need to switch to read/write */
6128                 mddev->ro = 0;
6129                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6130                 md_wakeup_thread(mddev->thread);
6131                 md_wakeup_thread(mddev->sync_thread);
6132                 did_change = 1;
6133         }
6134         atomic_inc(&mddev->writes_pending);
6135         if (mddev->safemode == 1)
6136                 mddev->safemode = 0;
6137         if (mddev->in_sync) {
6138                 spin_lock_irq(&mddev->write_lock);
6139                 if (mddev->in_sync) {
6140                         mddev->in_sync = 0;
6141                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6142                         md_wakeup_thread(mddev->thread);
6143                         did_change = 1;
6144                 }
6145                 spin_unlock_irq(&mddev->write_lock);
6146         }
6147         if (did_change)
6148                 sysfs_notify_dirent(mddev->sysfs_state);
6149         wait_event(mddev->sb_wait,
6150                    !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6151                    !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6152 }
6153
6154 void md_write_end(mddev_t *mddev)
6155 {
6156         if (atomic_dec_and_test(&mddev->writes_pending)) {
6157                 if (mddev->safemode == 2)
6158                         md_wakeup_thread(mddev->thread);
6159                 else if (mddev->safemode_delay)
6160                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6161         }
6162 }
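
     /*
      * Typical caller pattern (sketch): a raid personality brackets each
      * array write so the superblock 'active' state stays correct:
      *
      *	md_write_start(mddev, bio);	(may block while the sb is updated)
      *	... submit the underlying device writes ...
      *	md_write_end(mddev);		(called from the completion path)
      */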
6163
6164 /* md_allow_write(mddev)
6165  * Calling this ensures that the array is marked 'active' so that writes
6166  * may proceed without blocking.  It is important to call this before
6167  * attempting a GFP_KERNEL allocation while holding the mddev lock.
6168  * Must be called with mddev_lock held.
6169  *
6170  * In the ->external case MD_CHANGE_CLEAN cannot be cleared until the mddev
6171  * lock is dropped, so return -EAGAIN after notifying userspace.
6172  */
6173 int md_allow_write(mddev_t *mddev)
6174 {
6175         if (!mddev->pers)
6176                 return 0;
6177         if (mddev->ro)
6178                 return 0;
6179         if (!mddev->pers->sync_request)
6180                 return 0;
6181
6182         spin_lock_irq(&mddev->write_lock);
6183         if (mddev->in_sync) {
6184                 mddev->in_sync = 0;
6185                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6186                 if (mddev->safemode_delay &&
6187                     mddev->safemode == 0)
6188                         mddev->safemode = 1;
6189                 spin_unlock_irq(&mddev->write_lock);
6190                 md_update_sb(mddev, 0);
6191                 sysfs_notify_dirent(mddev->sysfs_state);
6192         } else
6193                 spin_unlock_irq(&mddev->write_lock);
6194
6195         if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6196                 return -EAGAIN;
6197         else
6198                 return 0;
6199 }
6200 EXPORT_SYMBOL_GPL(md_allow_write);
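
     /*
      * Example caller pattern (sketch): a personality that must perform a
      * GFP_KERNEL allocation while holding the mddev lock would do
      *
      *	err = md_allow_write(mddev);
      *	if (err)
      *		... handle -EAGAIN: the sb update completes later ...
      *	ptr = kmalloc(size, GFP_KERNEL);
      */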
6201
6202 #define SYNC_MARKS      10
6203 #define SYNC_MARK_STEP  (3*HZ)
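     /* i.e. the resync speed shown in /proc/mdstat is averaged over a
      * sliding window of roughly SYNC_MARKS * SYNC_MARK_STEP = 30 seconds.
      */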
6204 void md_do_sync(mddev_t *mddev)
6205 {
6206         mddev_t *mddev2;
6207         unsigned int currspeed = 0,
6208                  window;
6209         sector_t max_sectors,j, io_sectors;
6210         unsigned long mark[SYNC_MARKS];
6211         sector_t mark_cnt[SYNC_MARKS];
6212         int last_mark,m;
6213         struct list_head *tmp;
6214         sector_t last_check;
6215         int skipped = 0;
6216         mdk_rdev_t *rdev;
6217         char *desc;
6218
6219         /* just in case the thread restarts... */
6220         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6221                 return;
6222         if (mddev->ro) /* never try to sync a read-only array */
6223                 return;
6224
6225         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6226                 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6227                         desc = "data-check";
6228                 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6229                         desc = "requested-resync";
6230                 else
6231                         desc = "resync";
6232         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6233                 desc = "reshape";
6234         else
6235                 desc = "recovery";
6236
6237         /* we overload curr_resync somewhat here.
6238          * 0 == not engaged in resync at all
6239          * 2 == checking that there is no conflict with another sync
6240          * 1 == like 2, but have yielded to allow conflicting resync to
6241          *              commence
6242          * other == active in resync - this many blocks
6243          *
6244          * Before starting a resync we must have set curr_resync to
6245          * 2, and then checked that every "conflicting" array has curr_resync
6246          * less than ours.  When we find one that is the same or higher
6247          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
6248          * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
6249          * This will mean we have to start checking from the beginning again.
6250          *
6251          */
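             /* Example: if md0 and md1 share a physical unit and both reach
              * curr_resync == 2, the mddev at the lower address (say md0)
              * yields by dropping to 1; md1 then passes the check and
              * resyncs first, while md0 waits on resync_wait.
              */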
6252
6253         do {
6254                 mddev->curr_resync = 2;
6255
6256         try_again:
6257                 if (kthread_should_stop()) {
6258                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6259                         goto skip;
6260                 }
6261                 for_each_mddev(mddev2, tmp) {
6262                         if (mddev2 == mddev)
6263                                 continue;
6264                         if (!mddev->parallel_resync
6265                         &&  mddev2->curr_resync
6266                         &&  match_mddev_units(mddev, mddev2)) {
6267                                 DEFINE_WAIT(wq);
6268                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
6269                                         /* arbitrarily yield */
6270                                         mddev->curr_resync = 1;
6271                                         wake_up(&resync_wait);
6272                                 }
6273                                 if (mddev > mddev2 && mddev->curr_resync == 1)
6274                                         /* no need to wait here, we can wait the next
6275                                          * time 'round when curr_resync == 2
6276                                          */
6277                                         continue;
6278                                 /* We need to wait 'interruptible' so as not to
6279                                  * contribute to the load average, and not to
6280                                  * be caught by 'softlockup'
6281                                  */
6282                                 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6283                                 if (!kthread_should_stop() &&
6284                                     mddev2->curr_resync >= mddev->curr_resync) {
6285                                         printk(KERN_INFO "md: delaying %s of %s"
6286                                                " until %s has finished (they"
6287                                                " share one or more physical units)\n",
6288                                                desc, mdname(mddev), mdname(mddev2));
6289                                         mddev_put(mddev2);
6290                                         if (signal_pending(current))
6291                                                 flush_signals(current);
6292                                         schedule();
6293                                         finish_wait(&resync_wait, &wq);
6294                                         goto try_again;
6295                                 }
6296                                 finish_wait(&resync_wait, &wq);
6297                         }
6298                 }
6299         } while (mddev->curr_resync < 2);
6300
6301         j = 0;
6302         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6303                 /* resync follows the size requested by the personality,
6304                  * which defaults to physical size, but can be virtual size
6305                  */
6306                 max_sectors = mddev->resync_max_sectors;
6307                 mddev->resync_mismatches = 0;
6308                 /* we don't use the checkpoint if there's a bitmap */
6309                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6310                         j = mddev->resync_min;
6311                 else if (!mddev->bitmap)
6312                         j = mddev->recovery_cp;
6313
6314         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6315                 max_sectors = mddev->dev_sectors;
6316         else {
6317                 /* recovery follows the physical size of devices */
6318                 max_sectors = mddev->dev_sectors;
6319                 j = MaxSector;
6320                 list_for_each_entry(rdev, &mddev->disks, same_set)
6321                         if (rdev->raid_disk >= 0 &&
6322                             !test_bit(Faulty, &rdev->flags) &&
6323                             !test_bit(In_sync, &rdev->flags) &&
6324                             rdev->recovery_offset < j)
6325                                 j = rdev->recovery_offset;
6326         }
6327
6328         printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6329         printk(KERN_INFO "md: minimum _guaranteed_  speed:"
6330                 " %d KB/sec/disk.\n", speed_min(mddev));
6331         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6332                "(but not more than %d KB/sec) for %s.\n",
6333                speed_max(mddev), desc);
6334
6335         is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6336
6337         io_sectors = 0;
6338         for (m = 0; m < SYNC_MARKS; m++) {
6339                 mark[m] = jiffies;
6340                 mark_cnt[m] = io_sectors;
6341         }
6342         last_mark = 0;
6343         mddev->resync_mark = mark[last_mark];
6344         mddev->resync_mark_cnt = mark_cnt[last_mark];
6345
6346         /*
6347          * Tune reconstruction:
6348          */
6349         window = 32*(PAGE_SIZE/512);
6350         printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6351                 window/2,(unsigned long long) max_sectors/2);
6352
6353         atomic_set(&mddev->recovery_active, 0);
6354         last_check = 0;
6355
6356         if (j>2) {
6357                 printk(KERN_INFO 
6358                        "md: resuming %s of %s from checkpoint.\n",
6359                        desc, mdname(mddev));
6360                 mddev->curr_resync = j;
6361         }
6362
6363         while (j < max_sectors) {
6364                 sector_t sectors;
6365
6366                 skipped = 0;
6367
6368                 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6369                     ((mddev->curr_resync > mddev->curr_resync_completed &&
6370                       (mddev->curr_resync - mddev->curr_resync_completed)
6371                       > (max_sectors >> 4)) ||
6372                      (j - mddev->curr_resync_completed)*2
6373                      >= mddev->resync_max - mddev->curr_resync_completed
6374                             )) {
6375                         /* time to update curr_resync_completed */
6376                         blk_unplug(mddev->queue);
6377                         wait_event(mddev->recovery_wait,
6378                                    atomic_read(&mddev->recovery_active) == 0);
6379                         mddev->curr_resync_completed =
6380                                 mddev->curr_resync;
6381                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6382                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6383                 }
6384
6385                 while (j >= mddev->resync_max && !kthread_should_stop()) {
6386                         /* As this condition is controlled by user-space,
6387                          * we can block indefinitely, so use '_interruptible'
6388                          * to avoid triggering warnings.
6389                          */
6390                         flush_signals(current); /* just in case */
6391                         wait_event_interruptible(mddev->recovery_wait,
6392                                                  mddev->resync_max > j
6393                                                  || kthread_should_stop());
6394                 }
6395
6396                 if (kthread_should_stop())
6397                         goto interrupted;
6398
6399                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6400                                                   currspeed < speed_min(mddev));
6401                 if (sectors == 0) {
6402                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6403                         goto out;
6404                 }
6405
6406                 if (!skipped) { /* actual IO requested */
6407                         io_sectors += sectors;
6408                         atomic_add(sectors, &mddev->recovery_active);
6409                 }
6410
6411                 j += sectors;
6412                 if (j>1) mddev->curr_resync = j;
6413                 mddev->curr_mark_cnt = io_sectors;
6414                 if (last_check == 0)
6415                         /* this is the earliest that the rebuild will be
6416                          * visible in /proc/mdstat
6417                          */
6418                         md_new_event(mddev);
6419
6420                 if (last_check + window > io_sectors || j == max_sectors)
6421                         continue;
6422
6423                 last_check = io_sectors;
6424
6425                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6426                         break;
6427
6428         repeat:
6429                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6430                         /* step marks */
6431                         int next = (last_mark+1) % SYNC_MARKS;
6432
6433                         mddev->resync_mark = mark[next];
6434                         mddev->resync_mark_cnt = mark_cnt[next];
6435                         mark[next] = jiffies;
6436                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6437                         last_mark = next;
6438                 }
6439
6440
6441                 if (kthread_should_stop())
6442                         goto interrupted;
6443
6444
6445                 /*
6446                  * this loop exits only when we are slower than
6447                  * the 'hard' speed limit, or the system has been IO-idle
6448                  * for a jiffy.
6449                  * the system might be non-idle CPU-wise, but we only care
6450                  * about not overloading the IO subsystem. (things like an
6451                  * e2fsck being done on the RAID array should execute fast)
6452                  */
6453                 blk_unplug(mddev->queue);
6454                 cond_resched();
6455
6456                 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6457                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
6458
6459                 if (currspeed > speed_min(mddev)) {
6460                         if ((currspeed > speed_max(mddev)) ||
6461                                         !is_mddev_idle(mddev, 0)) {
6462                                 msleep(500);
6463                                 goto repeat;
6464                         }
6465                 }
6466         }
6467         printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6468         /*
6469          * this also signals 'finished resyncing' to md_stop
6470          */
6471  out:
6472         blk_unplug(mddev->queue);
6473
6474         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6475
6476         /* tell personality that we are finished */
6477         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6478
6479         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6480             mddev->curr_resync > 2) {
6481                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6482                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6483                                 if (mddev->curr_resync >= mddev->recovery_cp) {
6484                                         printk(KERN_INFO
6485                                                "md: checkpointing %s of %s.\n",
6486                                                desc, mdname(mddev));
6487                                         mddev->recovery_cp = mddev->curr_resync;
6488                                 }
6489                         } else
6490                                 mddev->recovery_cp = MaxSector;
6491                 } else {
6492                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6493                                 mddev->curr_resync = MaxSector;
6494                         list_for_each_entry(rdev, &mddev->disks, same_set)
6495                                 if (rdev->raid_disk >= 0 &&
6496                                     !test_bit(Faulty, &rdev->flags) &&
6497                                     !test_bit(In_sync, &rdev->flags) &&
6498                                     rdev->recovery_offset < mddev->curr_resync)
6499                                         rdev->recovery_offset = mddev->curr_resync;
6500                 }
6501         }
6502         set_bit(MD_CHANGE_DEVS, &mddev->flags);
6503
6504  skip:
6505         mddev->curr_resync = 0;
6506         mddev->curr_resync_completed = 0;
6507         mddev->resync_min = 0;
6508         mddev->resync_max = MaxSector;
6509         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6510         wake_up(&resync_wait);
6511         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6512         md_wakeup_thread(mddev->thread);
6513         return;
6514
6515  interrupted:
6516         /*
6517          * got a signal, exit.
6518          */
6519         printk(KERN_INFO
6520                "md: md_do_sync() got signal ... exiting\n");
6521         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6522         goto out;
6523
6524 }
6525 EXPORT_SYMBOL_GPL(md_do_sync);
6526
6527
6528 static int remove_and_add_spares(mddev_t *mddev)
6529 {
6530         mdk_rdev_t *rdev;
6531         int spares = 0;
6532
6533         mddev->curr_resync_completed = 0;
6534
6535         list_for_each_entry(rdev, &mddev->disks, same_set)
6536                 if (rdev->raid_disk >= 0 &&
6537                     !test_bit(Blocked, &rdev->flags) &&
6538                     (test_bit(Faulty, &rdev->flags) ||
6539                      ! test_bit(In_sync, &rdev->flags)) &&
6540                     atomic_read(&rdev->nr_pending)==0) {
6541                         if (mddev->pers->hot_remove_disk(
6542                                     mddev, rdev->raid_disk)==0) {
6543                                 char nm[20];
6544                                 sprintf(nm,"rd%d", rdev->raid_disk);
6545                                 sysfs_remove_link(&mddev->kobj, nm);
6546                                 rdev->raid_disk = -1;
6547                         }
6548                 }
6549
6550         if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6551                 list_for_each_entry(rdev, &mddev->disks, same_set) {
6552                         if (rdev->raid_disk >= 0 &&
6553                             !test_bit(In_sync, &rdev->flags) &&
6554                             !test_bit(Blocked, &rdev->flags))
6555                                 spares++;
6556                         if (rdev->raid_disk < 0
6557                             && !test_bit(Faulty, &rdev->flags)) {
6558                                 rdev->recovery_offset = 0;
6559                                 if (mddev->pers->
6560                                     hot_add_disk(mddev, rdev) == 0) {
6561                                         char nm[20];
6562                                         sprintf(nm, "rd%d", rdev->raid_disk);
6563                                         if (sysfs_create_link(&mddev->kobj,
6564                                                               &rdev->kobj, nm))
6565                                                 printk(KERN_WARNING
6566                                                        "md: cannot register "
6567                                                        "%s for %s\n",
6568                                                        nm, mdname(mddev));
6569                                         spares++;
6570                                         md_new_event(mddev);
6571                                 } else
6572                                         break;
6573                         }
6574                 }
6575         }
6576         return spares;
6577 }
6578 /*
6579  * This routine is regularly called by all per-raid-array threads to
6580  * deal with generic issues like resync and super-block update.
6581  * RAID personalities that don't have a thread (linear/raid0) do not
6582  * need this as they never do any recovery or update the superblock.
6583  *
6584  * It does not do any resync itself, but rather "forks" off other threads
6585  * to do that as needed.
6586  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6587  * "->recovery" and create a thread at ->sync_thread.
6588  * When the thread finishes it sets MD_RECOVERY_DONE
6589  * and wakes up this thread, which will reap it and finish up.
6590  * This thread also removes any faulty devices (with nr_pending == 0).
6591  *
6592  * The overall approach is:
6593  *  1/ if the superblock needs updating, update it.
6594  *  2/ If a recovery thread is running, don't do anything else.
6595  *  3/ If recovery has finished, clean up, possibly marking spares active.
6596  *  4/ If there are any faulty devices, remove them.
6597  *  5/ If the array is degraded, try to add spare devices.
6598  *  6/ If array has spares or is not in-sync, start a resync thread.
6599  */
6600 void md_check_recovery(mddev_t *mddev)
6601 {
6602         mdk_rdev_t *rdev;
6603
6604
6605         if (mddev->bitmap)
6606                 bitmap_daemon_work(mddev->bitmap);
6607
6611         if (signal_pending(current)) {
6612                 if (mddev->pers->sync_request && !mddev->external) {
6613                         printk(KERN_INFO "md: %s in immediate safe mode\n",
6614                                mdname(mddev));
6615                         mddev->safemode = 2;
6616                 }
6617                 flush_signals(current);
6618         }
6619
6620         if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6621                 return;
6622         if ( ! (
6623                 (mddev->flags && !mddev->external) ||
6624                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6625                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6626                 (mddev->external == 0 && mddev->safemode == 1) ||
6627                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6628                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6629                 ))
6630                 return;
6631
6632         if (mddev_trylock(mddev)) {
6633                 int spares = 0;
6634
6635                 if (mddev->ro) {
6636                         /* Only thing we do on a ro array is remove
6637                          * failed devices.
6638                          */
6639                         remove_and_add_spares(mddev);
6640                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6641                         goto unlock;
6642                 }
6643
6644                 if (!mddev->external) {
6645                         int did_change = 0;
6646                         spin_lock_irq(&mddev->write_lock);
6647                         if (mddev->safemode &&
6648                             !atomic_read(&mddev->writes_pending) &&
6649                             !mddev->in_sync &&
6650                             mddev->recovery_cp == MaxSector) {
6651                                 mddev->in_sync = 1;
6652                                 did_change = 1;
6653                                 if (mddev->persistent)
6654                                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6655                         }
6656                         if (mddev->safemode == 1)
6657                                 mddev->safemode = 0;
6658                         spin_unlock_irq(&mddev->write_lock);
6659                         if (did_change)
6660                                 sysfs_notify_dirent(mddev->sysfs_state);
6661                 }
6662
6663                 if (mddev->flags)
6664                         md_update_sb(mddev, 0);
6665
6666                 list_for_each_entry(rdev, &mddev->disks, same_set)
6667                         if (test_and_clear_bit(StateChanged, &rdev->flags))
6668                                 sysfs_notify_dirent(rdev->sysfs_state);
6669
6670
6671                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6672                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6673                         /* resync/recovery still happening */
6674                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6675                         goto unlock;
6676                 }
6677                 if (mddev->sync_thread) {
6678                         /* resync has finished, collect result */
6679                         md_unregister_thread(mddev->sync_thread);
6680                         mddev->sync_thread = NULL;
6681                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6682                             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6683                                 /* success...*/
6684                                 /* activate any spares */
6685                                 if (mddev->pers->spare_active(mddev))
6686                                         sysfs_notify(&mddev->kobj, NULL,
6687                                                      "degraded");
6688                         }
6689                         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6690                             mddev->pers->finish_reshape)
6691                                 mddev->pers->finish_reshape(mddev);
6692                         md_update_sb(mddev, 1);
6693
6694                         /* if the array is no longer degraded, then any saved_raid_disk
6695                          * information must be scrapped
6696                          */
6697                         if (!mddev->degraded)
6698                                 list_for_each_entry(rdev, &mddev->disks, same_set)
6699                                         rdev->saved_raid_disk = -1;
6700
6701                         mddev->recovery = 0;
6702                         /* flag recovery needed just to double check */
6703                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6704                         sysfs_notify_dirent(mddev->sysfs_action);
6705                         md_new_event(mddev);
6706                         goto unlock;
6707                 }
6708                 /* Set RUNNING before clearing NEEDED to avoid
6709                  * any transients in the value of "sync_action".
6710                  */
6711                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6712                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6713                 /* Clear some bits that don't mean anything, but
6714                  * might be left set
6715                  */
6716                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6717                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6718
6719                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6720                         goto unlock;
6721                 /* no recovery is running.
6722                  * remove any failed drives, then
6723                  * add spares if possible.
6724                  * Spares are also removed and re-added, to allow
6725                  * the personality to fail the re-add.
6726                  */
6727
6728                 if (mddev->reshape_position != MaxSector) {
6729                         if (mddev->pers->check_reshape == NULL ||
6730                             mddev->pers->check_reshape(mddev) != 0)
6731                                 /* Cannot proceed */
6732                                 goto unlock;
6733                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6734                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6735                 } else if ((spares = remove_and_add_spares(mddev))) {
6736                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6737                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6738                         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6739                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6740                 } else if (mddev->recovery_cp < MaxSector) {
6741                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6742                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6743                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6744                         /* nothing to be done ... */
6745                         goto unlock;
6746
6747                 if (mddev->pers->sync_request) {
6748                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6749                                 /* We are adding a device or devices to an array
6750                                  * which has the bitmap stored on all devices.
6751                                  * So make sure all bitmap pages get written
6752                                  */
6753                                 bitmap_write_all(mddev->bitmap);
6754                         }
6755                         mddev->sync_thread = md_register_thread(md_do_sync,
6756                                                                 mddev,
6757                                                                 "resync");
6758                         if (!mddev->sync_thread) {
6759                                 printk(KERN_ERR "%s: could not start resync"
6760                                         " thread...\n", 
6761                                         mdname(mddev));
6762                                 /* leave the spares where they are, it shouldn't hurt */
6763                                 mddev->recovery = 0;
6764                         } else
6765                                 md_wakeup_thread(mddev->sync_thread);
6766                         sysfs_notify_dirent(mddev->sysfs_action);
6767                         md_new_event(mddev);
6768                 }
6769         unlock:
6770                 if (!mddev->sync_thread) {
6771                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6772                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6773                                                &mddev->recovery))
6774                                 if (mddev->sysfs_action)
6775                                         sysfs_notify_dirent(mddev->sysfs_action);
6776                 }
6777                 mddev_unlock(mddev);
6778         }
6779 }
6780
6781 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6782 {
6783         sysfs_notify_dirent(rdev->sysfs_state);
6784         wait_event_timeout(rdev->blocked_wait,
6785                            !test_bit(Blocked, &rdev->flags),
6786                            msecs_to_jiffies(5000));
6787         rdev_dec_pending(rdev, mddev);
6788 }
6789 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6790
6791 static int md_notify_reboot(struct notifier_block *this,
6792                             unsigned long code, void *x)
6793 {
6794         struct list_head *tmp;
6795         mddev_t *mddev;
6796
6797         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6798
6799                 printk(KERN_INFO "md: stopping all md devices.\n");
6800
6801                 for_each_mddev(mddev, tmp)
6802                         if (mddev_trylock(mddev)) {
6803                                 /* Force a switch to read-only even if the
6804                                  * array appears to still be in use.  Hence
6805                                  * the '100'.
6806                                  */
6807                                 do_md_stop(mddev, 1, 100);
6808                                 mddev_unlock(mddev);
6809                         }
6810                 /*
6811                  * certain more exotic SCSI devices are known to be
6812          * volatile with respect to too-early system reboots. While the
6813                  * right place to handle this issue is the given
6814                  * driver, we do want to have a safe RAID driver ...
6815                  */
6816                 mdelay(1000*1);
6817         }
6818         return NOTIFY_DONE;
6819 }
6820
6821 static struct notifier_block md_notifier = {
6822         .notifier_call  = md_notify_reboot,
6823         .next           = NULL,
6824         .priority       = INT_MAX, /* before any real devices */
6825 };
6826
6827 static void md_geninit(void)
6828 {
6829         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6830
6831         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6832 }
6833
6834 static int __init md_init(void)
6835 {
6836         if (register_blkdev(MD_MAJOR, "md"))
6837                 return -1;
6838         if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6839                 unregister_blkdev(MD_MAJOR, "md");
6840                 return -1;
6841         }
6842         blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6843                             md_probe, NULL, NULL);
6844         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6845                             md_probe, NULL, NULL);
6846
6847         register_reboot_notifier(&md_notifier);
6848         raid_table_header = register_sysctl_table(raid_root_table);
6849
6850         md_geninit();
6851         return 0;
6852 }
6853
6854
6855 #ifndef MODULE
6856
6857 /*
6858  * Searches all registered partitions for autorun RAID arrays
6859  * at boot time.
6860  */
6861
6862 static LIST_HEAD(all_detected_devices);
6863 struct detected_devices_node {
6864         struct list_head list;
6865         dev_t dev;
6866 };
6867
6868 void md_autodetect_dev(dev_t dev)
6869 {
6870         struct detected_devices_node *node_detected_dev;
6871
6872         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6873         if (node_detected_dev) {
6874                 node_detected_dev->dev = dev;
6875                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
6876         } else {
6877                 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6878                         ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6879         }
6880 }
6881
6882
6883 static void autostart_arrays(int part)
6884 {
6885         mdk_rdev_t *rdev;
6886         struct detected_devices_node *node_detected_dev;
6887         dev_t dev;
6888         int i_scanned, i_passed;
6889
6890         i_scanned = 0;
6891         i_passed = 0;
6892
6893         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6894
6895         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6896                 i_scanned++;
6897                 node_detected_dev = list_entry(all_detected_devices.next,
6898                                         struct detected_devices_node, list);
6899                 list_del(&node_detected_dev->list);
6900                 dev = node_detected_dev->dev;
6901                 kfree(node_detected_dev);
6902                 rdev = md_import_device(dev,0, 90);
6903                 if (IS_ERR(rdev))
6904                         continue;
6905
6906                 if (test_bit(Faulty, &rdev->flags)) {
6907                         MD_BUG();
6908                         continue;
6909                 }
6910                 set_bit(AutoDetected, &rdev->flags);
6911                 list_add(&rdev->same_set, &pending_raid_disks);
6912                 i_passed++;
6913         }
6914
6915         printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6916                                                 i_scanned, i_passed);
6917
6918         autorun_devices(part);
6919 }
6920
6921 #endif /* !MODULE */
6922
6923 static __exit void md_exit(void)
6924 {
6925         mddev_t *mddev;
6926         struct list_head *tmp;
6927
6928         blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6929         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6930
6931         unregister_blkdev(MD_MAJOR,"md");
6932         unregister_blkdev(mdp_major, "mdp");
6933         unregister_reboot_notifier(&md_notifier);
6934         unregister_sysctl_table(raid_table_header);
6935         remove_proc_entry("mdstat", NULL);
6936         for_each_mddev(mddev, tmp) {
6937                 export_array(mddev);
6938                 mddev->hold_active = 0;
6939         }
6940 }
6941
6942 subsys_initcall(md_init);
6943 module_exit(md_exit)
6944
6945 static int get_ro(char *buffer, struct kernel_param *kp)
6946 {
6947         return sprintf(buffer, "%d", start_readonly);
6948 }
6949 static int set_ro(const char *val, struct kernel_param *kp)
6950 {
6951         char *e;
6952         int num = simple_strtoul(val, &e, 10);
6953         if (*val && (*e == '\0' || *e == '\n')) {
6954                 start_readonly = num;
6955                 return 0;
6956         }
6957         return -EINVAL;
6958 }
6959
6960 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
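     /* e.g. booting with md_mod.start_ro=1 makes arrays start in
      * auto-read-only mode (mddev->ro == 2) until the first write arrives.
      */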
6961 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6962
6963 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6964
6965 EXPORT_SYMBOL(register_md_personality);
6966 EXPORT_SYMBOL(unregister_md_personality);
6967 EXPORT_SYMBOL(md_error);
6968 EXPORT_SYMBOL(md_done_sync);
6969 EXPORT_SYMBOL(md_write_start);
6970 EXPORT_SYMBOL(md_write_end);
6971 EXPORT_SYMBOL(md_register_thread);
6972 EXPORT_SYMBOL(md_unregister_thread);
6973 EXPORT_SYMBOL(md_wakeup_thread);
6974 EXPORT_SYMBOL(md_check_recovery);
6975 MODULE_LICENSE("GPL");
6976 MODULE_ALIAS("md");
6977 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);