Remove deadlock potential in md_open
pandora-kernel.git: drivers/md/md.c
1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/hdreg.h>
43 #include <linux/proc_fs.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/file.h>
47 #include <linux/delay.h>
48 #include <linux/raid/md_p.h>
49 #include <linux/raid/md_u.h>
50 #include "md.h"
51 #include "bitmap.h"
52
53 #define DEBUG 0
54 #define dprintk(x...) ((void)(DEBUG && printk(x)))
55
56
57 #ifndef MODULE
58 static void autostart_arrays(int part);
59 #endif
60
61 static LIST_HEAD(pers_list);
62 static DEFINE_SPINLOCK(pers_lock);
63
64 static void md_print_devices(void);
65
66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
67
68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69
70 /*
71  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72  * is 1000 KB/sec, so the extra system load does not show up that much.
73  * Increase it if you want to have more _guaranteed_ speed. Note that
74  * the RAID driver will use the maximum available bandwidth if the IO
75  * subsystem is idle. There is also an 'absolute maximum' reconstruction
76  * speed limit - in case reconstruction slows down your system despite
77  * idle IO detection.
78  *
79  * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
80  * or via /sys/block/mdX/md/sync_speed_{min,max}.
81  */
82
83 static int sysctl_speed_limit_min = 1000;
84 static int sysctl_speed_limit_max = 200000;
85 static inline int speed_min(mddev_t *mddev)
86 {
87         return mddev->sync_speed_min ?
88                 mddev->sync_speed_min : sysctl_speed_limit_min;
89 }
90
91 static inline int speed_max(mddev_t *mddev)
92 {
93         return mddev->sync_speed_max ?
94                 mddev->sync_speed_max : sysctl_speed_limit_max;
95 }
96
97 static struct ctl_table_header *raid_table_header;
98
99 static ctl_table raid_table[] = {
100         {
101                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
102                 .procname       = "speed_limit_min",
103                 .data           = &sysctl_speed_limit_min,
104                 .maxlen         = sizeof(int),
105                 .mode           = S_IRUGO|S_IWUSR,
106                 .proc_handler   = &proc_dointvec,
107         },
108         {
109                 .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
110                 .procname       = "speed_limit_max",
111                 .data           = &sysctl_speed_limit_max,
112                 .maxlen         = sizeof(int),
113                 .mode           = S_IRUGO|S_IWUSR,
114                 .proc_handler   = &proc_dointvec,
115         },
116         { .ctl_name = 0 }
117 };
118
119 static ctl_table raid_dir_table[] = {
120         {
121                 .ctl_name       = DEV_RAID,
122                 .procname       = "raid",
123                 .maxlen         = 0,
124                 .mode           = S_IRUGO|S_IXUGO,
125                 .child          = raid_table,
126         },
127         { .ctl_name = 0 }
128 };
129
130 static ctl_table raid_root_table[] = {
131         {
132                 .ctl_name       = CTL_DEV,
133                 .procname       = "dev",
134                 .maxlen         = 0,
135                 .mode           = 0555,
136                 .child          = raid_dir_table,
137         },
138         { .ctl_name = 0 }
139 };
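/*
 * Illustrative userspace sketch (not part of this file): raising the
 * resync floor through the procfs knob declared above.  Values are in
 * KB/sec and writing requires root; the sysfs files
 * /sys/block/mdX/md/sync_speed_{min,max} behave the same way.
 */
#if 0 /* example only */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 50000);	/* guarantee at least ~50 MB/s */
	fclose(f);
	return 0;
}
#endif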
140
141 static struct block_device_operations md_fops;
142
143 static int start_readonly;
144
145 /*
146  * We have a system wide 'event count' that is incremented
147  * on any 'interesting' event, and readers of /proc/mdstat
148  * can use 'poll' or 'select' to find out when the event
149  * count increases.
150  *
151  * Events are:
152  *  start array, stop array, error, add device, remove device,
153  *  start build, activate spare
154  */
155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
156 static atomic_t md_event_count;
157 void md_new_event(mddev_t *mddev)
158 {
159         atomic_inc(&md_event_count);
160         wake_up(&md_event_waiters);
161 }
162 EXPORT_SYMBOL_GPL(md_new_event);
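/*
 * Illustrative userspace sketch (not part of this file): waiting for the
 * event count above to change by polling /proc/mdstat, as the comment
 * before md_event_waiters describes.  Reading the file first consumes
 * the current state; the poll then blocks until md_new_event() fires.
 */
#if 0 /* example only */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int wait_for_md_event(void)
{
	char buf[4096];
	int fd = open("/proc/mdstat", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	if (fd < 0)
		return -1;
	(void)read(fd, buf, sizeof(buf));	/* consume current state */
	poll(&pfd, 1, -1);	/* returns when the event count increases */
	close(fd);
	return 0;
}
#endif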
163
164 /* Alternate version that can be called from interrupts
165  * when calling sysfs_notify isn't needed.
166  */
167 static void md_new_event_inintr(mddev_t *mddev)
168 {
169         atomic_inc(&md_event_count);
170         wake_up(&md_event_waiters);
171 }
172
173 /*
174  * Enables iteration over all existing md arrays.
175  * all_mddevs_lock protects this list.
176  */
177 static LIST_HEAD(all_mddevs);
178 static DEFINE_SPINLOCK(all_mddevs_lock);
179
180
181 /*
182  * iterates through all used mddevs in the system.
183  * We take care to grab the all_mddevs_lock whenever navigating
184  * the list, and to always hold a refcount when unlocked.
185  * Any code which breaks out of this loop while owning
186  * a reference to the current mddev must mddev_put() it.
187  */
188 #define for_each_mddev(mddev,tmp)                                       \
189                                                                         \
190         for (({ spin_lock(&all_mddevs_lock);                            \
191                 tmp = all_mddevs.next;                                  \
192                 mddev = NULL;});                                        \
193              ({ if (tmp != &all_mddevs)                                 \
194                         mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
195                 spin_unlock(&all_mddevs_lock);                          \
196                 if (mddev) mddev_put(mddev);                            \
197                 mddev = list_entry(tmp, mddev_t, all_mddevs);           \
198                 tmp != &all_mddevs;});                                  \
199              ({ spin_lock(&all_mddevs_lock);                            \
200                 tmp = tmp->next;})                                      \
201                 )
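/*
 * Usage sketch for the macro above (kernel context assumed; the
 * function name is hypothetical): the loop holds a reference on the
 * current mddev while the lock is dropped, so a caller that breaks out
 * early still owns one reference and must balance it with mddev_put(),
 * exactly as the comment above requires.
 */
#if 0 /* example only */
static mddev_t *example_find_first_degraded(void)
{
	mddev_t *mddev;
	struct list_head *tmp;

	for_each_mddev(mddev, tmp) {
		if (mddev->degraded)
			return mddev;	/* caller must mddev_put() this */
	}
	return NULL;
}
#endif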
202
203
204 /* Rather than calling directly into the personality make_request function,
205  * IO requests come here first so that we can check if the device is
206  * being suspended pending a reconfiguration.
207  * We hold a refcount over the call to ->make_request.  By the time that
208  * call has finished, the bio has been linked into some internal structure
209  * and so is visible to ->quiesce(), so we don't need the refcount any more.
210  */
211 static int md_make_request(struct request_queue *q, struct bio *bio)
212 {
213         mddev_t *mddev = q->queuedata;
214         int rv;
215         if (mddev == NULL || mddev->pers == NULL) {
216                 bio_io_error(bio);
217                 return 0;
218         }
219         rcu_read_lock();
220         if (mddev->suspended) {
221                 DEFINE_WAIT(__wait);
222                 for (;;) {
223                         prepare_to_wait(&mddev->sb_wait, &__wait,
224                                         TASK_UNINTERRUPTIBLE);
225                         if (!mddev->suspended)
226                                 break;
227                         rcu_read_unlock();
228                         schedule();
229                         rcu_read_lock();
230                 }
231                 finish_wait(&mddev->sb_wait, &__wait);
232         }
233         atomic_inc(&mddev->active_io);
234         rcu_read_unlock();
235         rv = mddev->pers->make_request(q, bio);
236         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
237                 wake_up(&mddev->sb_wait);
238
239         return rv;
240 }
241
242 static void mddev_suspend(mddev_t *mddev)
243 {
244         BUG_ON(mddev->suspended);
245         mddev->suspended = 1;
246         synchronize_rcu();
247         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
248         mddev->pers->quiesce(mddev, 1);
249         md_unregister_thread(mddev->thread);
250         mddev->thread = NULL;
251         /* we now know that no code is executing in the personality module,
252          * except possibly the tail end of a ->bi_end_io function, but that
253          * is certain to complete before the module has a chance to get
254          * unloaded
255          */
256 }
257
258 static void mddev_resume(mddev_t *mddev)
259 {
260         mddev->suspended = 0;
261         wake_up(&mddev->sb_wait);
262         mddev->pers->quiesce(mddev, 0);
263 }
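/*
 * Sketch of the intended pairing of the two helpers above (kernel
 * context assumed; the function is hypothetical): suspend quiesces the
 * array so no requests are in flight inside the personality, the
 * caller reconfigures, and resume lets md_make_request() proceed again.
 */
#if 0 /* example only */
static void example_reconfigure(mddev_t *mddev)
{
	mddev_suspend(mddev);	/* waits for active_io to reach zero */
	/* ... swap personality data structures here ... */
	mddev_resume(mddev);	/* wakes waiters in md_make_request() */
}
#endif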
264
265
266 static inline mddev_t *mddev_get(mddev_t *mddev)
267 {
268         atomic_inc(&mddev->active);
269         return mddev;
270 }
271
272 static void mddev_delayed_delete(struct work_struct *ws);
273
274 static void mddev_put(mddev_t *mddev)
275 {
276         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
277                 return;
278         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
279             !mddev->hold_active) {
280                 list_del(&mddev->all_mddevs);
281                 if (mddev->gendisk) {
282                         /* we did a probe so need to clean up.
283                          * Call schedule_work inside the spinlock
284                          * so that flush_scheduled_work() after
285                          * mddev_find will succeed in waiting for the
286                          * work to be done.
287                          */
288                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
289                         schedule_work(&mddev->del_work);
290                 } else
291                         kfree(mddev);
292         }
293         spin_unlock(&all_mddevs_lock);
294 }
295
296 static mddev_t * mddev_find(dev_t unit)
297 {
298         mddev_t *mddev, *new = NULL;
299
300  retry:
301         spin_lock(&all_mddevs_lock);
302
303         if (unit) {
304                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
305                         if (mddev->unit == unit) {
306                                 mddev_get(mddev);
307                                 spin_unlock(&all_mddevs_lock);
308                                 kfree(new);
309                                 return mddev;
310                         }
311
312                 if (new) {
313                         list_add(&new->all_mddevs, &all_mddevs);
314                         spin_unlock(&all_mddevs_lock);
315                         new->hold_active = UNTIL_IOCTL;
316                         return new;
317                 }
318         } else if (new) {
319                 /* find an unused unit number */
320                 static int next_minor = 512;
321                 int start = next_minor;
322                 int is_free = 0;
323                 int dev = 0;
324                 while (!is_free) {
325                         dev = MKDEV(MD_MAJOR, next_minor);
326                         next_minor++;
327                         if (next_minor > MINORMASK)
328                                 next_minor = 0;
329                         if (next_minor == start) {
330                                 /* Oh dear, all in use. */
331                                 spin_unlock(&all_mddevs_lock);
332                                 kfree(new);
333                                 return NULL;
334                         }
335                                 
336                         is_free = 1;
337                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
338                                 if (mddev->unit == dev) {
339                                         is_free = 0;
340                                         break;
341                                 }
342                 }
343                 new->unit = dev;
344                 new->md_minor = MINOR(dev);
345                 new->hold_active = UNTIL_STOP;
346                 list_add(&new->all_mddevs, &all_mddevs);
347                 spin_unlock(&all_mddevs_lock);
348                 return new;
349         }
350         spin_unlock(&all_mddevs_lock);
351
352         new = kzalloc(sizeof(*new), GFP_KERNEL);
353         if (!new)
354                 return NULL;
355
356         new->unit = unit;
357         if (MAJOR(unit) == MD_MAJOR)
358                 new->md_minor = MINOR(unit);
359         else
360                 new->md_minor = MINOR(unit) >> MdpMinorShift;
361
362         mutex_init(&new->open_mutex);
363         mutex_init(&new->reconfig_mutex);
364         INIT_LIST_HEAD(&new->disks);
365         INIT_LIST_HEAD(&new->all_mddevs);
366         init_timer(&new->safemode_timer);
367         atomic_set(&new->active, 1);
368         atomic_set(&new->openers, 0);
369         atomic_set(&new->active_io, 0);
370         spin_lock_init(&new->write_lock);
371         init_waitqueue_head(&new->sb_wait);
372         init_waitqueue_head(&new->recovery_wait);
373         new->reshape_position = MaxSector;
374         new->resync_min = 0;
375         new->resync_max = MaxSector;
376         new->level = LEVEL_NONE;
377
378         goto retry;
379 }
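/*
 * Sketch of the lookup/refcount contract (kernel context assumed; the
 * function is hypothetical): mddev_find() returns a referenced mddev,
 * allocating one on first use, and every successful lookup must be
 * balanced by mddev_put().
 */
#if 0 /* example only */
static int example_lookup(dev_t unit)
{
	mddev_t *mddev = mddev_find(unit);

	if (!mddev)
		return -ENOMEM;
	/* ... use mddev ... */
	mddev_put(mddev);
	return 0;
}
#endif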
380
381 static inline int mddev_lock(mddev_t * mddev)
382 {
383         return mutex_lock_interruptible(&mddev->reconfig_mutex);
384 }
385
386 static inline int mddev_is_locked(mddev_t *mddev)
387 {
388         return mutex_is_locked(&mddev->reconfig_mutex);
389 }
390
391 static inline int mddev_trylock(mddev_t * mddev)
392 {
393         return mutex_trylock(&mddev->reconfig_mutex);
394 }
395
396 static inline void mddev_unlock(mddev_t * mddev)
397 {
398         mutex_unlock(&mddev->reconfig_mutex);
399
400         md_wakeup_thread(mddev->thread);
401 }
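/*
 * Sketch of the reconfig locking pattern used throughout this file
 * (kernel context assumed; the function is hypothetical): mddev_lock()
 * is interruptible, so ioctl and sysfs paths propagate -EINTR rather
 * than blocking forever, and mddev_unlock() kicks the per-array thread
 * to act on any changes made under the mutex.
 */
#if 0 /* example only */
static int example_store_attr(mddev_t *mddev)
{
	int err = mddev_lock(mddev);

	if (err)
		return err;	/* typically -EINTR */
	/* ... modify mddev under reconfig_mutex ... */
	mddev_unlock(mddev);	/* also wakes mddev->thread */
	return 0;
}
#endif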
402
403 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
404 {
405         mdk_rdev_t *rdev;
406
407         list_for_each_entry(rdev, &mddev->disks, same_set)
408                 if (rdev->desc_nr == nr)
409                         return rdev;
410
411         return NULL;
412 }
413
414 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
415 {
416         mdk_rdev_t *rdev;
417
418         list_for_each_entry(rdev, &mddev->disks, same_set)
419                 if (rdev->bdev->bd_dev == dev)
420                         return rdev;
421
422         return NULL;
423 }
424
425 static struct mdk_personality *find_pers(int level, char *clevel)
426 {
427         struct mdk_personality *pers;
428         list_for_each_entry(pers, &pers_list, list) {
429                 if (level != LEVEL_NONE && pers->level == level)
430                         return pers;
431                 if (strcmp(pers->name, clevel)==0)
432                         return pers;
433         }
434         return NULL;
435 }
436
437 /* return the offset of the superblock in 512-byte sectors */
438 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
439 {
440         sector_t num_sectors = bdev->bd_inode->i_size / 512;
441         return MD_NEW_SIZE_SECTORS(num_sectors);
442 }
443
444 static int alloc_disk_sb(mdk_rdev_t * rdev)
445 {
446         if (rdev->sb_page)
447                 MD_BUG();
448
449         rdev->sb_page = alloc_page(GFP_KERNEL);
450         if (!rdev->sb_page) {
451                 printk(KERN_ALERT "md: out of memory.\n");
452                 return -ENOMEM;
453         }
454
455         return 0;
456 }
457
458 static void free_disk_sb(mdk_rdev_t * rdev)
459 {
460         if (rdev->sb_page) {
461                 put_page(rdev->sb_page);
462                 rdev->sb_loaded = 0;
463                 rdev->sb_page = NULL;
464                 rdev->sb_start = 0;
465                 rdev->sectors = 0;
466         }
467 }
468
469
470 static void super_written(struct bio *bio, int error)
471 {
472         mdk_rdev_t *rdev = bio->bi_private;
473         mddev_t *mddev = rdev->mddev;
474
475         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
476                 printk("md: super_written gets error=%d, uptodate=%d\n",
477                        error, test_bit(BIO_UPTODATE, &bio->bi_flags));
478                 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
479                 md_error(mddev, rdev);
480         }
481
482         if (atomic_dec_and_test(&mddev->pending_writes))
483                 wake_up(&mddev->sb_wait);
484         bio_put(bio);
485 }
486
487 static void super_written_barrier(struct bio *bio, int error)
488 {
489         struct bio *bio2 = bio->bi_private;
490         mdk_rdev_t *rdev = bio2->bi_private;
491         mddev_t *mddev = rdev->mddev;
492
493         if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
494             error == -EOPNOTSUPP) {
495                 unsigned long flags;
496                 /* barriers don't appear to be supported :-( */
497                 set_bit(BarriersNotsupp, &rdev->flags);
498                 mddev->barriers_work = 0;
499                 spin_lock_irqsave(&mddev->write_lock, flags);
500                 bio2->bi_next = mddev->biolist;
501                 mddev->biolist = bio2;
502                 spin_unlock_irqrestore(&mddev->write_lock, flags);
503                 wake_up(&mddev->sb_wait);
504                 bio_put(bio);
505         } else {
506                 bio_put(bio2);
507                 bio->bi_private = rdev;
508                 super_written(bio, error);
509         }
510 }
511
512 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
513                    sector_t sector, int size, struct page *page)
514 {
515         /* write first size bytes of page to sector of rdev
516          * Increment mddev->pending_writes before returning
517          * and decrement it on completion, waking up sb_wait
518          * if zero is reached.
519          * If an error occurred, call md_error
520          *
521          * As we might need to resubmit the request if BIO_RW_BARRIER
522          * causes -EOPNOTSUPP, we allocate a spare bio...
523          */
524         struct bio *bio = bio_alloc(GFP_NOIO, 1);
525         int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
526
527         bio->bi_bdev = rdev->bdev;
528         bio->bi_sector = sector;
529         bio_add_page(bio, page, size, 0);
530         bio->bi_private = rdev;
531         bio->bi_end_io = super_written;
532         bio->bi_rw = rw;
533
534         atomic_inc(&mddev->pending_writes);
535         if (!test_bit(BarriersNotsupp, &rdev->flags)) {
536                 struct bio *rbio;
537                 rw |= (1<<BIO_RW_BARRIER);
538                 rbio = bio_clone(bio, GFP_NOIO);
539                 rbio->bi_private = bio;
540                 rbio->bi_end_io = super_written_barrier;
541                 submit_bio(rw, rbio);
542         } else
543                 submit_bio(rw, bio);
544 }
545
546 void md_super_wait(mddev_t *mddev)
547 {
548         /* wait for all superblock writes that were scheduled to complete.
549          * if any had to be retried (due to BARRIER problems), retry them
550          */
551         DEFINE_WAIT(wq);
552         for(;;) {
553                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
554                 if (atomic_read(&mddev->pending_writes)==0)
555                         break;
556                 while (mddev->biolist) {
557                         struct bio *bio;
558                         spin_lock_irq(&mddev->write_lock);
559                         bio = mddev->biolist;
560                         mddev->biolist = bio->bi_next;
561                         bio->bi_next = NULL;
562                         spin_unlock_irq(&mddev->write_lock);
563                         submit_bio(bio->bi_rw, bio);
564                 }
565                 schedule();
566         }
567         finish_wait(&mddev->sb_wait, &wq);
568 }
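/*
 * Sketch of the write/wait pairing (kernel context assumed; the
 * function is hypothetical): callers queue one bio per device with
 * md_super_write() and then block in md_super_wait() until
 * pending_writes drains, which also resubmits any writes whose barrier
 * came back -EOPNOTSUPP.  super_90_rdev_size_change() below uses this
 * pattern for a single rdev.
 */
#if 0 /* example only */
static void example_write_sb(mddev_t *mddev, mdk_rdev_t *rdev)
{
	md_super_write(mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(mddev);	/* returns once the write has completed */
}
#endif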
569
570 static void bi_complete(struct bio *bio, int error)
571 {
572         complete((struct completion*)bio->bi_private);
573 }
574
575 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
576                    struct page *page, int rw)
577 {
578         struct bio *bio = bio_alloc(GFP_NOIO, 1);
579         struct completion event;
580         int ret;
581
582         rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
583
584         bio->bi_bdev = bdev;
585         bio->bi_sector = sector;
586         bio_add_page(bio, page, size, 0);
587         init_completion(&event);
588         bio->bi_private = &event;
589         bio->bi_end_io = bi_complete;
590         submit_bio(rw, bio);
591         wait_for_completion(&event);
592
593         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
594         bio_put(bio);
595         return ret;
596 }
597 EXPORT_SYMBOL_GPL(sync_page_io);
598
599 static int read_disk_sb(mdk_rdev_t * rdev, int size)
600 {
601         char b[BDEVNAME_SIZE];
602         if (!rdev->sb_page) {
603                 MD_BUG();
604                 return -EINVAL;
605         }
606         if (rdev->sb_loaded)
607                 return 0;
608
609
610         if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
611                 goto fail;
612         rdev->sb_loaded = 1;
613         return 0;
614
615 fail:
616         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
617                 bdevname(rdev->bdev,b));
618         return -EINVAL;
619 }
620
621 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
622 {
623         return  sb1->set_uuid0 == sb2->set_uuid0 &&
624                 sb1->set_uuid1 == sb2->set_uuid1 &&
625                 sb1->set_uuid2 == sb2->set_uuid2 &&
626                 sb1->set_uuid3 == sb2->set_uuid3;
627 }
628
629 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
630 {
631         int ret;
632         mdp_super_t *tmp1, *tmp2;
633
634         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
635         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
636
637         if (!tmp1 || !tmp2) {
638                 ret = 0;
639                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
640                 goto abort;
641         }
642
643         *tmp1 = *sb1;
644         *tmp2 = *sb2;
645
646         /*
647          * nr_disks is not constant
648          */
649         tmp1->nr_disks = 0;
650         tmp2->nr_disks = 0;
651
652         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
653 abort:
654         kfree(tmp1);
655         kfree(tmp2);
656         return ret;
657 }
658
659
660 static u32 md_csum_fold(u32 csum)
661 {
662         csum = (csum & 0xffff) + (csum >> 16);
663         return (csum & 0xffff) + (csum >> 16);
664 }
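/*
 * Worked example for md_csum_fold() above: folding twice absorbs the
 * carry a single fold can produce.  For csum = 0x1ffff the first fold
 * gives 0xffff + 0x1 = 0x10000, and the second gives 0x0000 + 0x1 =
 * 0x0001, so the result always fits in 16 bits.
 */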
665
666 static unsigned int calc_sb_csum(mdp_super_t * sb)
667 {
668         u64 newcsum = 0;
669         u32 *sb32 = (u32*)sb;
670         int i;
671         unsigned int disk_csum, csum;
672
673         disk_csum = sb->sb_csum;
674         sb->sb_csum = 0;
675
676         for (i = 0; i < MD_SB_BYTES/4 ; i++)
677                 newcsum += sb32[i];
678         csum = (newcsum & 0xffffffff) + (newcsum>>32);
679
680
681 #ifdef CONFIG_ALPHA
682         /* This used to use csum_partial, which was wrong for several
683          * reasons including that different results are returned on
684          * different architectures.  It isn't critical that we get exactly
685          * the same return value as before (we always csum_fold before
686          * testing, and that removes any differences).  However as we
687          * know that csum_partial always returned a 16bit value on
688          * alphas, do a fold to maximise conformity to previous behaviour.
689          */
690         sb->sb_csum = md_csum_fold(disk_csum);
691 #else
692         sb->sb_csum = disk_csum;
693 #endif
694         return csum;
695 }
696
697
698 /*
699  * Handle superblock details.
700  * We want to be able to handle multiple superblock formats
701  * so we have a common interface to them all, and an array of
702  * different handlers.
703  * We rely on user-space to write the initial superblock, and support
704  * reading and updating of superblocks.
705  * Interface methods are:
706  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
707  *      loads and validates a superblock on dev.
708  *      if refdev != NULL, compare superblocks on both devices
709  *    Return:
710  *      0 - dev has a superblock that is compatible with refdev
711  *      1 - dev has a superblock that is compatible and newer than refdev
712  *          so dev should be used as the refdev in future
713  *     -EINVAL superblock incompatible or invalid
714  *     -othererror e.g. -EIO
715  *
716  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
717  *      Verify that dev is acceptable into mddev.
718  *       The first time, mddev->raid_disks will be 0, and data from
719  *       dev should be merged in.  Subsequent calls check that dev
720  *       is new enough.  Return 0 or -EINVAL
721  *
722  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
723  *     Update the superblock for rdev with data in mddev
724  *     This does not write to disc.
725  *
726  */
727
728 struct super_type  {
729         char                *name;
730         struct module       *owner;
731         int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
732                                           int minor_version);
733         int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
734         void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
735         unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
736                                                 sector_t num_sectors);
737 };
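/*
 * Sketch of how the load_super() return convention above is consumed
 * during assembly (kernel context assumed; the function is
 * hypothetical, modelled on the array analysis done elsewhere in this
 * file): each device is compared against the freshest superblock seen
 * so far, and a return of 1 promotes that device to be the new
 * reference.
 */
#if 0 /* example only */
static void example_pick_freshest(mddev_t *mddev, struct super_type *st)
{
	mdk_rdev_t *rdev, *freshest = NULL;

	list_for_each_entry(rdev, &mddev->disks, same_set) {
		switch (st->load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;	/* newer than the reference */
			break;
		case 0:
			break;			/* compatible, not newer */
		default:
			/* incompatible or unreadable: kick this device */
			break;
		}
	}
}
#endif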
738
739 /*
740  * Check that the given mddev has no bitmap.
741  *
742  * This function is called from the run method of all personalities that do not
743  * support bitmaps. It prints an error message and returns non-zero if mddev
744  * has a bitmap. Otherwise, it returns 0.
745  *
746  */
747 int md_check_no_bitmap(mddev_t *mddev)
748 {
749         if (!mddev->bitmap_file && !mddev->bitmap_offset)
750                 return 0;
751         printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
752                 mdname(mddev), mddev->pers->name);
753         return 1;
754 }
755 EXPORT_SYMBOL(md_check_no_bitmap);
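/*
 * Sketch of the intended call site (kernel context assumed; the
 * function is hypothetical): a personality without bitmap support
 * calls this at the top of its run() method and refuses to start if a
 * bitmap is configured.
 */
#if 0 /* example only */
static int example_run(mddev_t *mddev)
{
	if (md_check_no_bitmap(mddev))
		return -EINVAL;	/* message already printed */
	/* ... personality-specific setup ... */
	return 0;
}
#endif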
756
757 /*
758  * load_super for 0.90.0 
759  */
760 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
761 {
762         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
763         mdp_super_t *sb;
764         int ret;
765
766          * Calculate the position of the superblock (in 512-byte sectors);
767          * it's at the end of the disk.
768          * it's at the end of the disk.
769          *
770          * It also happens to be a multiple of 4Kb.
771          */
772         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
773
774         ret = read_disk_sb(rdev, MD_SB_BYTES);
775         if (ret) return ret;
776
777         ret = -EINVAL;
778
779         bdevname(rdev->bdev, b);
780         sb = (mdp_super_t*)page_address(rdev->sb_page);
781
782         if (sb->md_magic != MD_SB_MAGIC) {
783                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
784                        b);
785                 goto abort;
786         }
787
788         if (sb->major_version != 0 ||
789             sb->minor_version < 90 ||
790             sb->minor_version > 91) {
791                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
792                         sb->major_version, sb->minor_version,
793                         b);
794                 goto abort;
795         }
796
797         if (sb->raid_disks <= 0)
798                 goto abort;
799
800         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
801                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
802                         b);
803                 goto abort;
804         }
805
806         rdev->preferred_minor = sb->md_minor;
807         rdev->data_offset = 0;
808         rdev->sb_size = MD_SB_BYTES;
809
810         if (sb->level == LEVEL_MULTIPATH)
811                 rdev->desc_nr = -1;
812         else
813                 rdev->desc_nr = sb->this_disk.number;
814
815         if (!refdev) {
816                 ret = 1;
817         } else {
818                 __u64 ev1, ev2;
819                 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
820                 if (!uuid_equal(refsb, sb)) {
821                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
822                                 b, bdevname(refdev->bdev,b2));
823                         goto abort;
824                 }
825                 if (!sb_equal(refsb, sb)) {
826                         printk(KERN_WARNING "md: %s has same UUID"
827                                " but different superblock to %s\n",
828                                b, bdevname(refdev->bdev, b2));
829                         goto abort;
830                 }
831                 ev1 = md_event(sb);
832                 ev2 = md_event(refsb);
833                 if (ev1 > ev2)
834                         ret = 1;
835                 else 
836                         ret = 0;
837         }
838         rdev->sectors = rdev->sb_start;
839
840         if (rdev->sectors < sb->size * 2 && sb->level > 1)
841                 /* "this cannot possibly happen" ... */
842                 ret = -EINVAL;
843
844  abort:
845         return ret;
846 }
847
848 /*
849  * validate_super for 0.90.0
850  */
851 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
852 {
853         mdp_disk_t *desc;
854         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
855         __u64 ev1 = md_event(sb);
856
857         rdev->raid_disk = -1;
858         clear_bit(Faulty, &rdev->flags);
859         clear_bit(In_sync, &rdev->flags);
860         clear_bit(WriteMostly, &rdev->flags);
861         clear_bit(BarriersNotsupp, &rdev->flags);
862
863         if (mddev->raid_disks == 0) {
864                 mddev->major_version = 0;
865                 mddev->minor_version = sb->minor_version;
866                 mddev->patch_version = sb->patch_version;
867                 mddev->external = 0;
868                 mddev->chunk_sectors = sb->chunk_size >> 9;
869                 mddev->ctime = sb->ctime;
870                 mddev->utime = sb->utime;
871                 mddev->level = sb->level;
872                 mddev->clevel[0] = 0;
873                 mddev->layout = sb->layout;
874                 mddev->raid_disks = sb->raid_disks;
875                 mddev->dev_sectors = sb->size * 2;
876                 mddev->events = ev1;
877                 mddev->bitmap_offset = 0;
878                 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
879
880                 if (mddev->minor_version >= 91) {
881                         mddev->reshape_position = sb->reshape_position;
882                         mddev->delta_disks = sb->delta_disks;
883                         mddev->new_level = sb->new_level;
884                         mddev->new_layout = sb->new_layout;
885                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
886                 } else {
887                         mddev->reshape_position = MaxSector;
888                         mddev->delta_disks = 0;
889                         mddev->new_level = mddev->level;
890                         mddev->new_layout = mddev->layout;
891                         mddev->new_chunk_sectors = mddev->chunk_sectors;
892                 }
893
894                 if (sb->state & (1<<MD_SB_CLEAN))
895                         mddev->recovery_cp = MaxSector;
896                 else {
897                         if (sb->events_hi == sb->cp_events_hi && 
898                                 sb->events_lo == sb->cp_events_lo) {
899                                 mddev->recovery_cp = sb->recovery_cp;
900                         } else
901                                 mddev->recovery_cp = 0;
902                 }
903
904                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
905                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
906                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
907                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
908
909                 mddev->max_disks = MD_SB_DISKS;
910
911                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
912                     mddev->bitmap_file == NULL)
913                         mddev->bitmap_offset = mddev->default_bitmap_offset;
914
915         } else if (mddev->pers == NULL) {
916                 /* Insist on a good event counter while assembling */
917                 ++ev1;
918                 if (ev1 < mddev->events) 
919                         return -EINVAL;
920         } else if (mddev->bitmap) {
921                 /* if adding to array with a bitmap, then we can accept an
922                  * older device ... but not too old.
923                  */
924                 if (ev1 < mddev->bitmap->events_cleared)
925                         return 0;
926         } else {
927                 if (ev1 < mddev->events)
928                         /* just a hot-add of a new device, leave raid_disk at -1 */
929                         return 0;
930         }
931
932         if (mddev->level != LEVEL_MULTIPATH) {
933                 desc = sb->disks + rdev->desc_nr;
934
935                 if (desc->state & (1<<MD_DISK_FAULTY))
936                         set_bit(Faulty, &rdev->flags);
937                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
938                             desc->raid_disk < mddev->raid_disks */) {
939                         set_bit(In_sync, &rdev->flags);
940                         rdev->raid_disk = desc->raid_disk;
941                 }
942                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
943                         set_bit(WriteMostly, &rdev->flags);
944         } else /* MULTIPATH devices are always in sync */
945                 set_bit(In_sync, &rdev->flags);
946         return 0;
947 }
948
949 /*
950  * sync_super for 0.90.0
951  */
952 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
953 {
954         mdp_super_t *sb;
955         mdk_rdev_t *rdev2;
956         int next_spare = mddev->raid_disks;
957
958
959         /* make rdev->sb match mddev data..
960          *
961          * 1/ zero out disks
962          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
963          * 3/ any empty disks < next_spare become removed
964          *
965          * disks[0] gets initialised to REMOVED because
966          * we cannot be sure from other fields if it has
967          * been initialised or not.
968          */
969         int i;
970         int active=0, working=0,failed=0,spare=0,nr_disks=0;
971
972         rdev->sb_size = MD_SB_BYTES;
973
974         sb = (mdp_super_t*)page_address(rdev->sb_page);
975
976         memset(sb, 0, sizeof(*sb));
977
978         sb->md_magic = MD_SB_MAGIC;
979         sb->major_version = mddev->major_version;
980         sb->patch_version = mddev->patch_version;
981         sb->gvalid_words  = 0; /* ignored */
982         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
983         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
984         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
985         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
986
987         sb->ctime = mddev->ctime;
988         sb->level = mddev->level;
989         sb->size = mddev->dev_sectors / 2;
990         sb->raid_disks = mddev->raid_disks;
991         sb->md_minor = mddev->md_minor;
992         sb->not_persistent = 0;
993         sb->utime = mddev->utime;
994         sb->state = 0;
995         sb->events_hi = (mddev->events>>32);
996         sb->events_lo = (u32)mddev->events;
997
998         if (mddev->reshape_position == MaxSector)
999                 sb->minor_version = 90;
1000         else {
1001                 sb->minor_version = 91;
1002                 sb->reshape_position = mddev->reshape_position;
1003                 sb->new_level = mddev->new_level;
1004                 sb->delta_disks = mddev->delta_disks;
1005                 sb->new_layout = mddev->new_layout;
1006                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1007         }
1008         mddev->minor_version = sb->minor_version;
1009         if (mddev->in_sync)
1010         {
1011                 sb->recovery_cp = mddev->recovery_cp;
1012                 sb->cp_events_hi = (mddev->events>>32);
1013                 sb->cp_events_lo = (u32)mddev->events;
1014                 if (mddev->recovery_cp == MaxSector)
1015                         sb->state = (1<< MD_SB_CLEAN);
1016         } else
1017                 sb->recovery_cp = 0;
1018
1019         sb->layout = mddev->layout;
1020         sb->chunk_size = mddev->chunk_sectors << 9;
1021
1022         if (mddev->bitmap && mddev->bitmap_file == NULL)
1023                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1024
1025         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1026         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1027                 mdp_disk_t *d;
1028                 int desc_nr;
1029                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1030                     && !test_bit(Faulty, &rdev2->flags))
1031                         desc_nr = rdev2->raid_disk;
1032                 else
1033                         desc_nr = next_spare++;
1034                 rdev2->desc_nr = desc_nr;
1035                 d = &sb->disks[rdev2->desc_nr];
1036                 nr_disks++;
1037                 d->number = rdev2->desc_nr;
1038                 d->major = MAJOR(rdev2->bdev->bd_dev);
1039                 d->minor = MINOR(rdev2->bdev->bd_dev);
1040                 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1041                     && !test_bit(Faulty, &rdev2->flags))
1042                         d->raid_disk = rdev2->raid_disk;
1043                 else
1044                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1045                 if (test_bit(Faulty, &rdev2->flags))
1046                         d->state = (1<<MD_DISK_FAULTY);
1047                 else if (test_bit(In_sync, &rdev2->flags)) {
1048                         d->state = (1<<MD_DISK_ACTIVE);
1049                         d->state |= (1<<MD_DISK_SYNC);
1050                         active++;
1051                         working++;
1052                 } else {
1053                         d->state = 0;
1054                         spare++;
1055                         working++;
1056                 }
1057                 if (test_bit(WriteMostly, &rdev2->flags))
1058                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1059         }
1060         /* now set the "removed" and "faulty" bits on any missing devices */
1061         for (i=0 ; i < mddev->raid_disks ; i++) {
1062                 mdp_disk_t *d = &sb->disks[i];
1063                 if (d->state == 0 && d->number == 0) {
1064                         d->number = i;
1065                         d->raid_disk = i;
1066                         d->state = (1<<MD_DISK_REMOVED);
1067                         d->state |= (1<<MD_DISK_FAULTY);
1068                         failed++;
1069                 }
1070         }
1071         sb->nr_disks = nr_disks;
1072         sb->active_disks = active;
1073         sb->working_disks = working;
1074         sb->failed_disks = failed;
1075         sb->spare_disks = spare;
1076
1077         sb->this_disk = sb->disks[rdev->desc_nr];
1078         sb->sb_csum = calc_sb_csum(sb);
1079 }
1080
1081 /*
1082  * rdev_size_change for 0.90.0
1083  */
1084 static unsigned long long
1085 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1086 {
1087         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1088                 return 0; /* component must fit device */
1089         if (rdev->mddev->bitmap_offset)
1090                 return 0; /* can't move bitmap */
1091         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1092         if (!num_sectors || num_sectors > rdev->sb_start)
1093                 num_sectors = rdev->sb_start;
1094         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1095                        rdev->sb_page);
1096         md_super_wait(rdev->mddev);
1097         return num_sectors / 2; /* kB for sysfs */
1098 }
1099
1100
1101 /*
1102  * version 1 superblock
1103  */
1104
1105 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1106 {
1107         __le32 disk_csum;
1108         u32 csum;
1109         unsigned long long newcsum;
1110         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1111         __le32 *isuper = (__le32*)sb;
1112         int i;
1113
1114         disk_csum = sb->sb_csum;
1115         sb->sb_csum = 0;
1116         newcsum = 0;
1117         for (i=0; size>=4; size -= 4 )
1118                 newcsum += le32_to_cpu(*isuper++);
1119
1120         if (size == 2)
1121                 newcsum += le16_to_cpu(*(__le16*) isuper);
1122
1123         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1124         sb->sb_csum = disk_csum;
1125         return cpu_to_le32(csum);
1126 }
1127
1128 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1129 {
1130         struct mdp_superblock_1 *sb;
1131         int ret;
1132         sector_t sb_start;
1133         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1134         int bmask;
1135
1136         /*
1137          * Calculate the position of the superblock in 512-byte sectors.
1138          * It is always aligned to a 4K boundary and
1139          * depending on minor_version, it can be:
1140          * 0: At least 8K, but less than 12K, from end of device
1141          * 1: At start of device
1142          * 2: 4K from start of device.
1143          */
1144         switch(minor_version) {
1145         case 0:
1146                 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1147                 sb_start -= 8*2;
1148                 sb_start &= ~(sector_t)(4*2-1);
1149                 break;
1150         case 1:
1151                 sb_start = 0;
1152                 break;
1153         case 2:
1154                 sb_start = 8;
1155                 break;
1156         default:
1157                 return -EINVAL;
1158         }
1159         rdev->sb_start = sb_start;
1160
1161         /* The superblock is rarely larger than 1K, but it can be larger,
1162          * and it is safe to read 4K, so we do that
1163          */
1164         ret = read_disk_sb(rdev, 4096);
1165         if (ret) return ret;
1166
1167
1168         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1169
1170         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1171             sb->major_version != cpu_to_le32(1) ||
1172             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1173             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1174             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1175                 return -EINVAL;
1176
1177         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1178                 printk("md: invalid superblock checksum on %s\n",
1179                         bdevname(rdev->bdev,b));
1180                 return -EINVAL;
1181         }
1182         if (le64_to_cpu(sb->data_size) < 10) {
1183                 printk("md: data_size too small on %s\n",
1184                        bdevname(rdev->bdev,b));
1185                 return -EINVAL;
1186         }
1187
1188         rdev->preferred_minor = 0xffff;
1189         rdev->data_offset = le64_to_cpu(sb->data_offset);
1190         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1191
1192         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1193         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1194         if (rdev->sb_size & bmask)
1195                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1196
1197         if (minor_version
1198             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1199                 return -EINVAL;
1200
1201         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1202                 rdev->desc_nr = -1;
1203         else
1204                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1205
1206         if (!refdev) {
1207                 ret = 1;
1208         } else {
1209                 __u64 ev1, ev2;
1210                 struct mdp_superblock_1 *refsb = 
1211                         (struct mdp_superblock_1*)page_address(refdev->sb_page);
1212
1213                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1214                     sb->level != refsb->level ||
1215                     sb->layout != refsb->layout ||
1216                     sb->chunksize != refsb->chunksize) {
1217                         printk(KERN_WARNING "md: %s has strangely different"
1218                                 " superblock to %s\n",
1219                                 bdevname(rdev->bdev,b),
1220                                 bdevname(refdev->bdev,b2));
1221                         return -EINVAL;
1222                 }
1223                 ev1 = le64_to_cpu(sb->events);
1224                 ev2 = le64_to_cpu(refsb->events);
1225
1226                 if (ev1 > ev2)
1227                         ret = 1;
1228                 else
1229                         ret = 0;
1230         }
1231         if (minor_version)
1232                 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1233                         le64_to_cpu(sb->data_offset);
1234         else
1235                 rdev->sectors = rdev->sb_start;
1236         if (rdev->sectors < le64_to_cpu(sb->data_size))
1237                 return -EINVAL;
1238         rdev->sectors = le64_to_cpu(sb->data_size);
1239         if (le64_to_cpu(sb->size) > rdev->sectors)
1240                 return -EINVAL;
1241         return ret;
1242 }
1243
1244 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1245 {
1246         struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1247         __u64 ev1 = le64_to_cpu(sb->events);
1248
1249         rdev->raid_disk = -1;
1250         clear_bit(Faulty, &rdev->flags);
1251         clear_bit(In_sync, &rdev->flags);
1252         clear_bit(WriteMostly, &rdev->flags);
1253         clear_bit(BarriersNotsupp, &rdev->flags);
1254
1255         if (mddev->raid_disks == 0) {
1256                 mddev->major_version = 1;
1257                 mddev->patch_version = 0;
1258                 mddev->external = 0;
1259                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1260                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1261                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1262                 mddev->level = le32_to_cpu(sb->level);
1263                 mddev->clevel[0] = 0;
1264                 mddev->layout = le32_to_cpu(sb->layout);
1265                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1266                 mddev->dev_sectors = le64_to_cpu(sb->size);
1267                 mddev->events = ev1;
1268                 mddev->bitmap_offset = 0;
1269                 mddev->default_bitmap_offset = 1024 >> 9;
1270                 
1271                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1272                 memcpy(mddev->uuid, sb->set_uuid, 16);
1273
1274                 mddev->max_disks =  (4096-256)/2;
1275
1276                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1277                     mddev->bitmap_file == NULL )
1278                         mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1279
1280                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1281                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1282                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1283                         mddev->new_level = le32_to_cpu(sb->new_level);
1284                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1285                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1286                 } else {
1287                         mddev->reshape_position = MaxSector;
1288                         mddev->delta_disks = 0;
1289                         mddev->new_level = mddev->level;
1290                         mddev->new_layout = mddev->layout;
1291                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1292                 }
1293
1294         } else if (mddev->pers == NULL) {
1295                 /* Insist on a good event counter while assembling */
1296                 ++ev1;
1297                 if (ev1 < mddev->events)
1298                         return -EINVAL;
1299         } else if (mddev->bitmap) {
1300                 /* If adding to array with a bitmap, then we can accept an
1301                  * older device, but not too old.
1302                  */
1303                 if (ev1 < mddev->bitmap->events_cleared)
1304                         return 0;
1305         } else {
1306                 if (ev1 < mddev->events)
1307                         /* just a hot-add of a new device, leave raid_disk at -1 */
1308                         return 0;
1309         }
1310         if (mddev->level != LEVEL_MULTIPATH) {
1311                 int role;
1312                 if (rdev->desc_nr < 0 ||
1313                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1314                         role = 0xffff;
1315                         rdev->desc_nr = -1;
1316                 } else
1317                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1318                 switch(role) {
1319                 case 0xffff: /* spare */
1320                         break;
1321                 case 0xfffe: /* faulty */
1322                         set_bit(Faulty, &rdev->flags);
1323                         break;
1324                 default:
1325                         if ((le32_to_cpu(sb->feature_map) &
1326                              MD_FEATURE_RECOVERY_OFFSET))
1327                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1328                         else
1329                                 set_bit(In_sync, &rdev->flags);
1330                         rdev->raid_disk = role;
1331                         break;
1332                 }
1333                 if (sb->devflags & WriteMostly1)
1334                         set_bit(WriteMostly, &rdev->flags);
1335         } else /* MULTIPATH devices are always in sync */
1336                 set_bit(In_sync, &rdev->flags);
1337
1338         return 0;
1339 }
1340
1341 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1342 {
1343         struct mdp_superblock_1 *sb;
1344         mdk_rdev_t *rdev2;
1345         int max_dev, i;
1346         /* make rdev->sb match mddev and rdev data. */
1347
1348         sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1349
1350         sb->feature_map = 0;
1351         sb->pad0 = 0;
1352         sb->recovery_offset = cpu_to_le64(0);
1353         memset(sb->pad1, 0, sizeof(sb->pad1));
1354         memset(sb->pad2, 0, sizeof(sb->pad2));
1355         memset(sb->pad3, 0, sizeof(sb->pad3));
1356
1357         sb->utime = cpu_to_le64((__u64)mddev->utime);
1358         sb->events = cpu_to_le64(mddev->events);
1359         if (mddev->in_sync)
1360                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1361         else
1362                 sb->resync_offset = cpu_to_le64(0);
1363
1364         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1365
1366         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1367         sb->size = cpu_to_le64(mddev->dev_sectors);
1368         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1369         sb->level = cpu_to_le32(mddev->level);
1370         sb->layout = cpu_to_le32(mddev->layout);
1371
1372         if (mddev->bitmap && mddev->bitmap_file == NULL) {
1373                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1374                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1375         }
1376
1377         if (rdev->raid_disk >= 0 &&
1378             !test_bit(In_sync, &rdev->flags)) {
1379                 if (mddev->curr_resync_completed > rdev->recovery_offset)
1380                         rdev->recovery_offset = mddev->curr_resync_completed;
1381                 if (rdev->recovery_offset > 0) {
1382                         sb->feature_map |=
1383                                 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1384                         sb->recovery_offset =
1385                                 cpu_to_le64(rdev->recovery_offset);
1386                 }
1387         }
1388
1389         if (mddev->reshape_position != MaxSector) {
1390                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1391                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1392                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1393                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1394                 sb->new_level = cpu_to_le32(mddev->new_level);
1395                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1396         }
1397
1398         max_dev = 0;
1399         list_for_each_entry(rdev2, &mddev->disks, same_set)
1400                 if (rdev2->desc_nr+1 > max_dev)
1401                         max_dev = rdev2->desc_nr+1;
1402
1403         if (max_dev > le32_to_cpu(sb->max_dev)) {
1404                 int bmask;
1405                 sb->max_dev = cpu_to_le32(max_dev);
1406                 rdev->sb_size = max_dev * 2 + 256;
1407                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1408                 if (rdev->sb_size & bmask)
1409                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1410         }
1411         for (i=0; i<max_dev;i++)
1412                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1413         
1414         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1415                 i = rdev2->desc_nr;
1416                 if (test_bit(Faulty, &rdev2->flags))
1417                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1418                 else if (test_bit(In_sync, &rdev2->flags))
1419                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1420                 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1421                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1422                 else
1423                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1424         }
1425
1426         sb->sb_csum = calc_sb_1_csum(sb);
1427 }
1428
1429 static unsigned long long
1430 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1431 {
1432         struct mdp_superblock_1 *sb;
1433         sector_t max_sectors;
1434         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1435                 return 0; /* component must fit device */
1436         if (rdev->sb_start < rdev->data_offset) {
1437                 /* minor versions 1 and 2; superblock before data */
1438                 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1439                 max_sectors -= rdev->data_offset;
1440                 if (!num_sectors || num_sectors > max_sectors)
1441                         num_sectors = max_sectors;
1442         } else if (rdev->mddev->bitmap_offset) {
1443                 /* minor version 0 with a bitmap, which we can't move */
1444                 return 0;
1445         } else {
1446                 /* minor version 0; superblock after data */
1447                 sector_t sb_start;
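                /* The v1.0 superblock sits 8K (16 sectors) before the end
                 * of the device, aligned down to an 8-sector boundary;
                 * all units here are 512-byte sectors. */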
1448                 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1449                 sb_start &= ~(sector_t)(4*2 - 1);
1450                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1451                 if (!num_sectors || num_sectors > max_sectors)
1452                         num_sectors = max_sectors;
1453                 rdev->sb_start = sb_start;
1454         }
1455         sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1456         sb->data_size = cpu_to_le64(num_sectors);
1457         sb->super_offset = rdev->sb_start;
1458         sb->sb_csum = calc_sb_1_csum(sb);
1459         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1460                        rdev->sb_page);
1461         md_super_wait(rdev->mddev);
1462         return num_sectors / 2; /* kB for sysfs */
1463 }
1464
1465 static struct super_type super_types[] = {
1466         [0] = {
1467                 .name   = "0.90.0",
1468                 .owner  = THIS_MODULE,
1469                 .load_super         = super_90_load,
1470                 .validate_super     = super_90_validate,
1471                 .sync_super         = super_90_sync,
1472                 .rdev_size_change   = super_90_rdev_size_change,
1473         },
1474         [1] = {
1475                 .name   = "md-1",
1476                 .owner  = THIS_MODULE,
1477                 .load_super         = super_1_load,
1478                 .validate_super     = super_1_validate,
1479                 .sync_super         = super_1_sync,
1480                 .rdev_size_change   = super_1_rdev_size_change,
1481         },
1482 };
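/*
 * All metadata access goes through this table, indexed by
 * mddev->major_version: entry 0 handles the original 0.90 format and
 * entry 1 the v1.x superblocks (v1 minor versions only differ in where
 * the superblock is placed on the device).
 */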
1483
1484 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1485 {
1486         mdk_rdev_t *rdev, *rdev2;
1487
1488         rcu_read_lock();
1489         rdev_for_each_rcu(rdev, mddev1)
1490                 rdev_for_each_rcu(rdev2, mddev2)
1491                         if (rdev->bdev->bd_contains ==
1492                             rdev2->bdev->bd_contains) {
1493                                 rcu_read_unlock();
1494                                 return 1;
1495                         }
1496         rcu_read_unlock();
1497         return 0;
1498 }
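/*
 * Informally: two arrays "match" when any of their components live on the
 * same underlying disk; bd_contains maps a partition back to the whole
 * device, so different partitions of one disk still compare equal here.
 */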
1499
1500 static LIST_HEAD(pending_raid_disks);
1501
1502 /*
1503  * Try to register data integrity profile for an mddev
1504  *
1505  * This is called when an array is started and after a disk has been kicked
1506  * from the array. It only succeeds if all working and active component devices
1507  * are integrity capable with matching profiles.
1508  */
1509 int md_integrity_register(mddev_t *mddev)
1510 {
1511         mdk_rdev_t *rdev, *reference = NULL;
1512
1513         if (list_empty(&mddev->disks))
1514                 return 0; /* nothing to do */
1515         if (blk_get_integrity(mddev->gendisk))
1516                 return 0; /* already registered */
1517         list_for_each_entry(rdev, &mddev->disks, same_set) {
1518                 /* skip spares and non-functional disks */
1519                 if (test_bit(Faulty, &rdev->flags))
1520                         continue;
1521                 if (rdev->raid_disk < 0)
1522                         continue;
1523                 /*
1524                  * If at least one rdev is not integrity capable, we can not
1525                  * enable data integrity for the md device.
1526                  */
1527                 if (!bdev_get_integrity(rdev->bdev))
1528                         return -EINVAL;
1529                 if (!reference) {
1530                         /* Use the first rdev as the reference */
1531                         reference = rdev;
1532                         continue;
1533                 }
1534                 /* does this rdev's profile match the reference profile? */
1535                 if (blk_integrity_compare(reference->bdev->bd_disk,
1536                                 rdev->bdev->bd_disk) < 0)
1537                         return -EINVAL;
1538         }
1539         /*
1540          * All component devices are integrity capable and have matching
1541          * profiles, register the common profile for the md device.
1542          */
1543         if (blk_integrity_register(mddev->gendisk,
1544                         bdev_get_integrity(reference->bdev)) != 0) {
1545                 printk(KERN_ERR "md: failed to register integrity for %s\n",
1546                         mdname(mddev));
1547                 return -EINVAL;
1548         }
1549         printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1550                 mdname(mddev));
1551         return 0;
1552 }
1553 EXPORT_SYMBOL(md_integrity_register);
1554
1555 /* Disable data integrity if non-capable/non-matching disk is being added */
1556 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1557 {
1558         struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1559         struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1560
1561         if (!bi_mddev) /* nothing to do */
1562                 return;
1563         if (rdev->raid_disk < 0) /* skip spares */
1564                 return;
1565         if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1566                                              rdev->bdev->bd_disk) >= 0)
1567                 return;
1568         printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1569         blk_integrity_unregister(mddev->gendisk);
1570 }
1571 EXPORT_SYMBOL(md_integrity_add_rdev);
1572
1573 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1574 {
1575         char b[BDEVNAME_SIZE];
1576         struct kobject *ko;
1577         char *s;
1578         int err;
1579
1580         if (rdev->mddev) {
1581                 MD_BUG();
1582                 return -EINVAL;
1583         }
1584
1585         /* prevent duplicates */
1586         if (find_rdev(mddev, rdev->bdev->bd_dev))
1587                 return -EEXIST;
1588
1589         /* make sure rdev->sectors is at least mddev->dev_sectors */
1590         if (rdev->sectors && (mddev->dev_sectors == 0 ||
1591                         rdev->sectors < mddev->dev_sectors)) {
1592                 if (mddev->pers) {
1593                         /* Cannot change size, so fail
1594                          * If mddev->level <= 0, then we don't care
1595                          * about aligning sizes (e.g. linear)
1596                          */
1597                         if (mddev->level > 0)
1598                                 return -ENOSPC;
1599                 } else
1600                         mddev->dev_sectors = rdev->sectors;
1601         }
1602
1603         /* Verify rdev->desc_nr is unique.
1604          * If it is -1, assign a free number, else
1605          * check number is not in use
1606          */
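        /* On an active array the search starts at raid_disks, so a
         * hot-added device never takes a number in the active slot range.
         */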
1607         if (rdev->desc_nr < 0) {
1608                 int choice = 0;
1609                 if (mddev->pers) choice = mddev->raid_disks;
1610                 while (find_rdev_nr(mddev, choice))
1611                         choice++;
1612                 rdev->desc_nr = choice;
1613         } else {
1614                 if (find_rdev_nr(mddev, rdev->desc_nr))
1615                         return -EBUSY;
1616         }
1617         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1618                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1619                        mdname(mddev), mddev->max_disks);
1620                 return -EBUSY;
1621         }
1622         bdevname(rdev->bdev,b);
1623         while ((s = strchr(b, '/')) != NULL)
1624                 *s = '!';
1625
1626         rdev->mddev = mddev;
1627         printk(KERN_INFO "md: bind<%s>\n", b);
1628
1629         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1630                 goto fail;
1631
1632         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1633         if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1634                 kobject_del(&rdev->kobj);
1635                 goto fail;
1636         }
1637         rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1638
1639         list_add_rcu(&rdev->same_set, &mddev->disks);
1640         bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1641
1642         /* May as well allow recovery to be retried once */
1643         mddev->recovery_disabled = 0;
1644
1645         return 0;
1646
1647  fail:
1648         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1649                b, mdname(mddev));
1650         return err;
1651 }
1652
1653 static void md_delayed_delete(struct work_struct *ws)
1654 {
1655         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1656         kobject_del(&rdev->kobj);
1657         kobject_put(&rdev->kobj);
1658 }
1659
1660 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1661 {
1662         char b[BDEVNAME_SIZE];
1663         if (!rdev->mddev) {
1664                 MD_BUG();
1665                 return;
1666         }
1667         bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1668         list_del_rcu(&rdev->same_set);
1669         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1670         rdev->mddev = NULL;
1671         sysfs_remove_link(&rdev->kobj, "block");
1672         sysfs_put(rdev->sysfs_state);
1673         rdev->sysfs_state = NULL;
1674         /* We need to delay this, otherwise we can deadlock when
1675          * writing 'remove' to "dev/state".  We also need
1676          * to delay it due to rcu usage.
1677          */
1678         synchronize_rcu();
1679         INIT_WORK(&rdev->del_work, md_delayed_delete);
1680         kobject_get(&rdev->kobj);
1681         schedule_work(&rdev->del_work);
1682 }
1683
1684 /*
1685  * prevent the device from being mounted, repartitioned or
1686  * otherwise reused by a RAID array (or any other kernel
1687  * subsystem), by bd_claiming the device.
1688  */
1689 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1690 {
1691         int err = 0;
1692         struct block_device *bdev;
1693         char b[BDEVNAME_SIZE];
1694
1695         bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1696         if (IS_ERR(bdev)) {
1697                 printk(KERN_ERR "md: could not open %s.\n",
1698                         __bdevname(dev, b));
1699                 return PTR_ERR(bdev);
1700         }
1701         err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1702         if (err) {
1703                 printk(KERN_ERR "md: could not bd_claim %s.\n",
1704                         bdevname(bdev, b));
1705                 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1706                 return err;
1707         }
1708         if (!shared)
1709                 set_bit(AllReserved, &rdev->flags);
1710         rdev->bdev = bdev;
1711         return err;
1712 }
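/*
 * The bd_claim holder token is the rdev itself for exclusive claims; all
 * shared claims use the address of lock_rdev as a common token, so
 * several shared openers can coexist on one device.
 */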
1713
1714 static void unlock_rdev(mdk_rdev_t *rdev)
1715 {
1716         struct block_device *bdev = rdev->bdev;
1717         rdev->bdev = NULL;
1718         if (!bdev)
1719                 MD_BUG();
1720         bd_release(bdev);
1721         blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1722 }
1723
1724 void md_autodetect_dev(dev_t dev);
1725
1726 static void export_rdev(mdk_rdev_t * rdev)
1727 {
1728         char b[BDEVNAME_SIZE];
1729         printk(KERN_INFO "md: export_rdev(%s)\n",
1730                 bdevname(rdev->bdev,b));
1731         if (rdev->mddev)
1732                 MD_BUG();
1733         free_disk_sb(rdev);
1734 #ifndef MODULE
1735         if (test_bit(AutoDetected, &rdev->flags))
1736                 md_autodetect_dev(rdev->bdev->bd_dev);
1737 #endif
1738         unlock_rdev(rdev);
1739         kobject_put(&rdev->kobj);
1740 }
1741
1742 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1743 {
1744         unbind_rdev_from_array(rdev);
1745         export_rdev(rdev);
1746 }
1747
1748 static void export_array(mddev_t *mddev)
1749 {
1750         mdk_rdev_t *rdev, *tmp;
1751
1752         rdev_for_each(rdev, tmp, mddev) {
1753                 if (!rdev->mddev) {
1754                         MD_BUG();
1755                         continue;
1756                 }
1757                 kick_rdev_from_array(rdev);
1758         }
1759         if (!list_empty(&mddev->disks))
1760                 MD_BUG();
1761         mddev->raid_disks = 0;
1762         mddev->major_version = 0;
1763 }
1764
1765 static void print_desc(mdp_disk_t *desc)
1766 {
1767         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1768                 desc->major,desc->minor,desc->raid_disk,desc->state);
1769 }
1770
1771 static void print_sb_90(mdp_super_t *sb)
1772 {
1773         int i;
1774
1775         printk(KERN_INFO 
1776                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1777                 sb->major_version, sb->minor_version, sb->patch_version,
1778                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1779                 sb->ctime);
1780         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1781                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1782                 sb->md_minor, sb->layout, sb->chunk_size);
1783         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
1784                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1785                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1786                 sb->failed_disks, sb->spare_disks,
1787                 sb->sb_csum, (unsigned long)sb->events_lo);
1788
1789         printk(KERN_INFO);
1790         for (i = 0; i < MD_SB_DISKS; i++) {
1791                 mdp_disk_t *desc;
1792
1793                 desc = sb->disks + i;
1794                 if (desc->number || desc->major || desc->minor ||
1795                     desc->raid_disk || (desc->state && (desc->state != 4))) {
1796                         printk("     D %2d: ", i);
1797                         print_desc(desc);
1798                 }
1799         }
1800         printk(KERN_INFO "md:     THIS: ");
1801         print_desc(&sb->this_disk);
1802 }
1803
1804 static void print_sb_1(struct mdp_superblock_1 *sb)
1805 {
1806         __u8 *uuid;
1807
1808         uuid = sb->set_uuid;
1809         printk(KERN_INFO
1810                "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1811                ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1812                "md:    Name: \"%s\" CT:%llu\n",
1813                 le32_to_cpu(sb->major_version),
1814                 le32_to_cpu(sb->feature_map),
1815                 uuid[0], uuid[1], uuid[2], uuid[3],
1816                 uuid[4], uuid[5], uuid[6], uuid[7],
1817                 uuid[8], uuid[9], uuid[10], uuid[11],
1818                 uuid[12], uuid[13], uuid[14], uuid[15],
1819                 sb->set_name,
1820                 (unsigned long long)le64_to_cpu(sb->ctime)
1821                        & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1822
1823         uuid = sb->device_uuid;
1824         printk(KERN_INFO
1825                "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1826                         " RO:%llu\n"
1827                "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1828                         ":%02x%02x%02x%02x%02x%02x\n"
1829                "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1830                "md:         (MaxDev:%u) \n",
1831                 le32_to_cpu(sb->level),
1832                 (unsigned long long)le64_to_cpu(sb->size),
1833                 le32_to_cpu(sb->raid_disks),
1834                 le32_to_cpu(sb->layout),
1835                 le32_to_cpu(sb->chunksize),
1836                 (unsigned long long)le64_to_cpu(sb->data_offset),
1837                 (unsigned long long)le64_to_cpu(sb->data_size),
1838                 (unsigned long long)le64_to_cpu(sb->super_offset),
1839                 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1840                 le32_to_cpu(sb->dev_number),
1841                 uuid[0], uuid[1], uuid[2], uuid[3],
1842                 uuid[4], uuid[5], uuid[6], uuid[7],
1843                 uuid[8], uuid[9], uuid[10], uuid[11],
1844                 uuid[12], uuid[13], uuid[14], uuid[15],
1845                 sb->devflags,
1846                 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1847                 (unsigned long long)le64_to_cpu(sb->events),
1848                 (unsigned long long)le64_to_cpu(sb->resync_offset),
1849                 le32_to_cpu(sb->sb_csum),
1850                 le32_to_cpu(sb->max_dev)
1851                 );
1852 }
1853
1854 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1855 {
1856         char b[BDEVNAME_SIZE];
1857         printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1858                 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1859                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1860                 rdev->desc_nr);
1861         if (rdev->sb_loaded) {
1862                 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1863                 switch (major_version) {
1864                 case 0:
1865                         print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1866                         break;
1867                 case 1:
1868                         print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1869                         break;
1870                 }
1871         } else
1872                 printk(KERN_INFO "md: no rdev superblock!\n");
1873 }
1874
1875 static void md_print_devices(void)
1876 {
1877         struct list_head *tmp;
1878         mdk_rdev_t *rdev;
1879         mddev_t *mddev;
1880         char b[BDEVNAME_SIZE];
1881
1882         printk("\n");
1883         printk("md:     **********************************\n");
1884         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
1885         printk("md:     **********************************\n");
1886         for_each_mddev(mddev, tmp) {
1887
1888                 if (mddev->bitmap)
1889                         bitmap_print_sb(mddev->bitmap);
1890                 else
1891                         printk("%s: ", mdname(mddev));
1892                 list_for_each_entry(rdev, &mddev->disks, same_set)
1893                         printk("<%s>", bdevname(rdev->bdev,b));
1894                 printk("\n");
1895
1896                 list_for_each_entry(rdev, &mddev->disks, same_set)
1897                         print_rdev(rdev, mddev->major_version);
1898         }
1899         printk("md:     **********************************\n");
1900         printk("\n");
1901 }
1902
1903
1904 static void sync_sbs(mddev_t * mddev, int nospares)
1905 {
1906         /* Update each superblock (in-memory image), but
1907          * if we are allowed to, skip spares which already
1908          * have the right event counter, or have one earlier
1909          * (which would mean they aren't being marked as dirty
1910          * with the rest of the array)
1911          */
1912         mdk_rdev_t *rdev;
1913
1914         list_for_each_entry(rdev, &mddev->disks, same_set) {
1915                 if (rdev->sb_events == mddev->events ||
1916                     (nospares &&
1917                      rdev->raid_disk < 0 &&
1918                      (rdev->sb_events & 1) == 0 &&
1919                      rdev->sb_events+1 == mddev->events)) {
1920                         /* Don't update this superblock */
1921                         rdev->sb_loaded = 2;
1922                 } else {
1923                         super_types[mddev->major_version].
1924                                 sync_super(mddev, rdev);
1925                         rdev->sb_loaded = 1;
1926                 }
1927         }
1928 }
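/*
 * An rdev left at sb_loaded == 2 above is deliberately skipped by the
 * write-out loop in md_update_sb(), which only writes superblocks with
 * sb_loaded == 1.
 */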
1929
1930 static void md_update_sb(mddev_t * mddev, int force_change)
1931 {
1932         mdk_rdev_t *rdev;
1933         int sync_req;
1934         int nospares = 0;
1935
1936         mddev->utime = get_seconds();
1937         if (mddev->external)
1938                 return;
1939 repeat:
1940         spin_lock_irq(&mddev->write_lock);
1941
1942         set_bit(MD_CHANGE_PENDING, &mddev->flags);
1943         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1944                 force_change = 1;
1945         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1946                 /* just a clean <-> dirty transition; possibly leave spares
1947                  * alone, though if the events count isn't the right even/odd
1948                  * value we will have to update the spares after all
1949                  */
1950                 nospares = 1;
1951         if (force_change)
1952                 nospares = 0;
1953         if (mddev->degraded)
1954                 /* If the array is degraded, then skipping spares is both
1955                  * dangerous and fairly pointless.
1956                  * Dangerous because a device that was removed from the array
1957                  * might have an event count that still looks up-to-date,
1958                  * so it can be re-added without a resync.
1959                  * Pointless because if there are any spares to skip,
1960                  * then a recovery will happen and soon that array won't
1961                  * be degraded any more and the spare can go back to sleep then.
1962                  */
1963                 nospares = 0;
1964
1965         sync_req = mddev->in_sync;
1966
1967         /* If this is just a dirty<->clean transition, and the array is clean
1968          * and 'events' is odd, we can roll back to the previous clean state */
1969         if (nospares
1970             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1971             && (mddev->events & 1)
1972             && mddev->events != 1)
1973                 mddev->events--;
1974         else {
1975                 /* otherwise we have to go forward and ... */
1976                 mddev->events++;
1977                 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1978                         /* .. if the array isn't clean, insist on an odd 'events' */
1979                         if ((mddev->events & 1) == 0) {
1980                                 mddev->events++;
1981                                 nospares = 0;
1982                         }
1983                 } else {
1984                         /* otherwise insist on an even 'events' (for clean states) */
1985                         if (mddev->events & 1) {
1986                                 mddev->events++;
1987                                 nospares = 0;
1988                         }
1989                 }
1990         }
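        /* Net effect: 'events' ends up even exactly when the array is
         * clean and odd when it is dirty, and a bare clean<->dirty flip
         * can roll back to the previous count so that spare superblocks
         * need not be rewritten. */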
1991
1992         if (!mddev->events) {
1993                 /*
1994                  * oops, this 64-bit counter should never wrap.
1995                  * Either we are around the year 1 trillion A.D., assuming
1996                  * 1 reboot per second, or we have a bug:
1997                  */
1998                 MD_BUG();
1999                 mddev->events--;
2000         }
2001
2002         /*
2003          * do not write anything to disk if using
2004          * nonpersistent superblocks
2005          */
2006         if (!mddev->persistent) {
2007                 if (!mddev->external)
2008                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2009
2010                 spin_unlock_irq(&mddev->write_lock);
2011                 wake_up(&mddev->sb_wait);
2012                 return;
2013         }
2014         sync_sbs(mddev, nospares);
2015         spin_unlock_irq(&mddev->write_lock);
2016
2017         dprintk(KERN_INFO 
2018                 "md: updating %s RAID superblock on device (in sync %d)\n",
2019                 mdname(mddev),mddev->in_sync);
2020
2021         bitmap_update_sb(mddev->bitmap);
2022         list_for_each_entry(rdev, &mddev->disks, same_set) {
2023                 char b[BDEVNAME_SIZE];
2024                 dprintk(KERN_INFO "md: ");
2025                 if (rdev->sb_loaded != 1)
2026                         continue; /* no noise on spare devices */
2027                 if (test_bit(Faulty, &rdev->flags))
2028                         dprintk("(skipping faulty ");
2029
2030                 dprintk("%s ", bdevname(rdev->bdev,b));
2031                 if (!test_bit(Faulty, &rdev->flags)) {
2032                         md_super_write(mddev,rdev,
2033                                        rdev->sb_start, rdev->sb_size,
2034                                        rdev->sb_page);
2035                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2036                                 bdevname(rdev->bdev,b),
2037                                 (unsigned long long)rdev->sb_start);
2038                         rdev->sb_events = mddev->events;
2039
2040                 } else
2041                         dprintk(")\n");
2042                 if (mddev->level == LEVEL_MULTIPATH)
2043                         /* only need to write one superblock... */
2044                         break;
2045         }
2046         md_super_wait(mddev);
2047         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2048
2049         spin_lock_irq(&mddev->write_lock);
2050         if (mddev->in_sync != sync_req ||
2051             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2052                 /* have to write it out again */
2053                 spin_unlock_irq(&mddev->write_lock);
2054                 goto repeat;
2055         }
2056         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2057         spin_unlock_irq(&mddev->write_lock);
2058         wake_up(&mddev->sb_wait);
2059         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2060                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2061
2062 }
2063
2064 /* words written to sysfs files may, or may not, be \n terminated.
2065  * We want to accept either case. For this we use cmd_match.
2066  */
2067 static int cmd_match(const char *cmd, const char *str)
2068 {
2069         /* See if cmd, written into a sysfs file, matches
2070          * str.  They must either be the same, or cmd can
2071          * have a trailing newline
2072          */
2073         while (*cmd && *str && *cmd == *str) {
2074                 cmd++;
2075                 str++;
2076         }
2077         if (*cmd == '\n')
2078                 cmd++;
2079         if (*str || *cmd)
2080                 return 0;
2081         return 1;
2082 }
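/*
 * Informative examples: cmd_match("faulty\n", "faulty") and
 * cmd_match("faulty", "faulty") both return 1, while
 * cmd_match("faulty2", "faulty") and cmd_match("fault", "faulty") return 0.
 */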
2083
2084 struct rdev_sysfs_entry {
2085         struct attribute attr;
2086         ssize_t (*show)(mdk_rdev_t *, char *);
2087         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2088 };
2089
2090 static ssize_t
2091 state_show(mdk_rdev_t *rdev, char *page)
2092 {
2093         char *sep = "";
2094         size_t len = 0;
2095
2096         if (test_bit(Faulty, &rdev->flags)) {
2097                 len+= sprintf(page+len, "%sfaulty",sep);
2098                 sep = ",";
2099         }
2100         if (test_bit(In_sync, &rdev->flags)) {
2101                 len += sprintf(page+len, "%sin_sync",sep);
2102                 sep = ",";
2103         }
2104         if (test_bit(WriteMostly, &rdev->flags)) {
2105                 len += sprintf(page+len, "%swrite_mostly",sep);
2106                 sep = ",";
2107         }
2108         if (test_bit(Blocked, &rdev->flags)) {
2109                 len += sprintf(page+len, "%sblocked", sep);
2110                 sep = ",";
2111         }
2112         if (!test_bit(Faulty, &rdev->flags) &&
2113             !test_bit(In_sync, &rdev->flags)) {
2114                 len += sprintf(page+len, "%sspare", sep);
2115                 sep = ",";
2116         }
2117         return len+sprintf(page+len, "\n");
2118 }
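/*
 * A device that is neither faulty nor in_sync reads back as "spare", so
 * typical output is a comma-separated list such as "faulty,blocked\n".
 */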
2119
2120 static ssize_t
2121 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2122 {
2123         /* can write
2124          *  faulty  - simulates an error
2125          *  remove  - disconnects the device
2126          *  writemostly - sets write_mostly
2127          *  -writemostly - clears write_mostly
2128          *  blocked - sets the Blocked flag
2129          *  -blocked - clears the Blocked flag
2130          *  insync - sets In_sync provided the device isn't active
2131          */
2132         int err = -EINVAL;
2133         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2134                 md_error(rdev->mddev, rdev);
2135                 err = 0;
2136         } else if (cmd_match(buf, "remove")) {
2137                 if (rdev->raid_disk >= 0)
2138                         err = -EBUSY;
2139                 else {
2140                         mddev_t *mddev = rdev->mddev;
2141                         kick_rdev_from_array(rdev);
2142                         if (mddev->pers)
2143                                 md_update_sb(mddev, 1);
2144                         md_new_event(mddev);
2145                         err = 0;
2146                 }
2147         } else if (cmd_match(buf, "writemostly")) {
2148                 set_bit(WriteMostly, &rdev->flags);
2149                 err = 0;
2150         } else if (cmd_match(buf, "-writemostly")) {
2151                 clear_bit(WriteMostly, &rdev->flags);
2152                 err = 0;
2153         } else if (cmd_match(buf, "blocked")) {
2154                 set_bit(Blocked, &rdev->flags);
2155                 err = 0;
2156         } else if (cmd_match(buf, "-blocked")) {
2157                 clear_bit(Blocked, &rdev->flags);
2158                 wake_up(&rdev->blocked_wait);
2159                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2160                 md_wakeup_thread(rdev->mddev->thread);
2161
2162                 err = 0;
2163         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2164                 set_bit(In_sync, &rdev->flags);
2165                 err = 0;
2166         }
2167         if (!err && rdev->sysfs_state)
2168                 sysfs_notify_dirent(rdev->sysfs_state);
2169         return err ? err : len;
2170 }
2171 static struct rdev_sysfs_entry rdev_state =
2172 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
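/*
 * For example (assuming an array md0 with component sda1), writing
 * "-writemostly" to /sys/block/md0/md/dev-sda1/state clears the
 * WriteMostly flag; any word not listed above fails with -EINVAL.
 */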
2173
2174 static ssize_t
2175 errors_show(mdk_rdev_t *rdev, char *page)
2176 {
2177         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2178 }
2179
2180 static ssize_t
2181 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2182 {
2183         char *e;
2184         unsigned long n = simple_strtoul(buf, &e, 10);
2185         if (*buf && (*e == 0 || *e == '\n')) {
2186                 atomic_set(&rdev->corrected_errors, n);
2187                 return len;
2188         }
2189         return -EINVAL;
2190 }
2191 static struct rdev_sysfs_entry rdev_errors =
2192 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2193
2194 static ssize_t
2195 slot_show(mdk_rdev_t *rdev, char *page)
2196 {
2197         if (rdev->raid_disk < 0)
2198                 return sprintf(page, "none\n");
2199         else
2200                 return sprintf(page, "%d\n", rdev->raid_disk);
2201 }
2202
2203 static ssize_t
2204 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2205 {
2206         char *e;
2207         int err;
2208         char nm[20];
2209         int slot = simple_strtoul(buf, &e, 10);
2210         if (strncmp(buf, "none", 4) == 0)
2211                 slot = -1;
2212         else if (e == buf || (*e && *e != '\n'))
2213                 return -EINVAL;
2214         if (rdev->mddev->pers && slot == -1) {
2215                 /* Setting 'slot' on an active array requires also
2216                  * updating the 'rd%d' link, and communicating
2217                  * with the personality with ->hot_*_disk.
2218                  * For now we only support removing
2219                  * failed/spare devices.  This normally happens automatically,
2220                  * but not when the metadata is externally managed.
2221                  */
2222                 if (rdev->raid_disk == -1)
2223                         return -EEXIST;
2224                 /* personality does all needed checks */
2225                 if (rdev->mddev->pers->hot_add_disk == NULL)
2226                         return -EINVAL;
2227                 err = rdev->mddev->pers->
2228                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2229                 if (err)
2230                         return err;
2231                 sprintf(nm, "rd%d", rdev->raid_disk);
2232                 sysfs_remove_link(&rdev->mddev->kobj, nm);
2233                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2234                 md_wakeup_thread(rdev->mddev->thread);
2235         } else if (rdev->mddev->pers) {
2236                 mdk_rdev_t *rdev2;
2237                 /* Activating a spare .. or possibly reactivating
2238                  * if we ever get bitmaps working here.
2239                  */
2240
2241                 if (rdev->raid_disk != -1)
2242                         return -EBUSY;
2243
2244                 if (rdev->mddev->pers->hot_add_disk == NULL)
2245                         return -EINVAL;
2246
2247                 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2248                         if (rdev2->raid_disk == slot)
2249                                 return -EEXIST;
2250
2251                 rdev->raid_disk = slot;
2252                 if (test_bit(In_sync, &rdev->flags))
2253                         rdev->saved_raid_disk = slot;
2254                 else
2255                         rdev->saved_raid_disk = -1;
2256                 err = rdev->mddev->pers->
2257                         hot_add_disk(rdev->mddev, rdev);
2258                 if (err) {
2259                         rdev->raid_disk = -1;
2260                         return err;
2261                 } else
2262                         sysfs_notify_dirent(rdev->sysfs_state);
2263                 sprintf(nm, "rd%d", rdev->raid_disk);
2264                 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2265                         printk(KERN_WARNING
2266                                "md: cannot register "
2267                                "%s for %s\n",
2268                                nm, mdname(rdev->mddev));
2269
2270                 /* don't wake anyone up, leave that to userspace. */
2271         } else {
2272                 if (slot >= rdev->mddev->raid_disks)
2273                         return -ENOSPC;
2274                 rdev->raid_disk = slot;
2275                 /* assume it is working */
2276                 clear_bit(Faulty, &rdev->flags);
2277                 clear_bit(WriteMostly, &rdev->flags);
2278                 set_bit(In_sync, &rdev->flags);
2279                 sysfs_notify_dirent(rdev->sysfs_state);
2280         }
2281         return len;
2282 }
2283
2284
2285 static struct rdev_sysfs_entry rdev_slot =
2286 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
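/*
 * Summary of the three cases above: "none" on an active array hot-removes
 * a failed or spare device, a number on an active array hot-adds the
 * device into that slot, and on an inactive array the slot is simply
 * recorded (for externally managed metadata).
 */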
2287
2288 static ssize_t
2289 offset_show(mdk_rdev_t *rdev, char *page)
2290 {
2291         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2292 }
2293
2294 static ssize_t
2295 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2296 {
2297         char *e;
2298         unsigned long long offset = simple_strtoull(buf, &e, 10);
2299         if (e == buf || (*e && *e != '\n'))
2300                 return -EINVAL;
2301         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2302                 return -EBUSY;
2303         if (rdev->sectors && rdev->mddev->external)
2304                 /* Must set offset before size, so overlap checks
2305                  * can be sane */
2306                 return -EBUSY;
2307         rdev->data_offset = offset;
2308         return len;
2309 }
2310
2311 static struct rdev_sysfs_entry rdev_offset =
2312 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2313
2314 static ssize_t
2315 rdev_size_show(mdk_rdev_t *rdev, char *page)
2316 {
2317         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2318 }
2319
2320 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2321 {
2322         /* check if two start/length pairs overlap */
2323         if (s1+l1 <= s2)
2324                 return 0;
2325         if (s2+l2 <= s1)
2326                 return 0;
2327         return 1;
2328 }
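/*
 * The ranges are treated as half-open: overlaps(0, 100, 100, 50) is 0
 * because the first range ends exactly where the second begins, while
 * overlaps(0, 100, 99, 50) is 1.
 */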
2329
2330 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2331 {
2332         unsigned long long blocks;
2333         sector_t new;
2334
2335         if (strict_strtoull(buf, 10, &blocks) < 0)
2336                 return -EINVAL;
2337
2338         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2339                 return -EINVAL; /* sector conversion overflow */
2340
2341         new = blocks * 2;
2342         if (new != blocks * 2)
2343                 return -EINVAL; /* unsigned long long to sector_t overflow */
2344
2345         *sectors = new;
2346         return 0;
2347 }
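/*
 * Input is a count of 1K blocks, output is 512-byte sectors: "1024"
 * yields *sectors == 2048.  Values whose doubling would overflow the
 * unsigned long long, or that don't fit in sector_t, get -EINVAL.
 */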
2348
2349 static ssize_t
2350 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2351 {
2352         mddev_t *my_mddev = rdev->mddev;
2353         sector_t oldsectors = rdev->sectors;
2354         sector_t sectors;
2355
2356         if (strict_blocks_to_sectors(buf, &sectors) < 0)
2357                 return -EINVAL;
2358         if (my_mddev->pers && rdev->raid_disk >= 0) {
2359                 if (my_mddev->persistent) {
2360                         sectors = super_types[my_mddev->major_version].
2361                                 rdev_size_change(rdev, sectors);
2362                         if (!sectors)
2363                                 return -EBUSY;
2364                 } else if (!sectors)
2365                         sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2366                                 rdev->data_offset;
2367         }
2368         if (sectors < my_mddev->dev_sectors)
2369                 return -EINVAL; /* component must fit device */
2370
2371         rdev->sectors = sectors;
2372         if (sectors > oldsectors && my_mddev->external) {
2373                 /* need to check that all other rdevs with the same ->bdev
2374                  * do not overlap.  We need to unlock the mddev to avoid
2375                  * a deadlock.  We have already changed rdev->sectors, and if
2376                  * we have to change it back, we will have the lock again.
2377                  */
2378                 mddev_t *mddev;
2379                 int overlap = 0;
2380                 struct list_head *tmp;
2381
2382                 mddev_unlock(my_mddev);
2383                 for_each_mddev(mddev, tmp) {
2384                         mdk_rdev_t *rdev2;
2385
2386                         mddev_lock(mddev);
2387                         list_for_each_entry(rdev2, &mddev->disks, same_set)
2388                                 if (test_bit(AllReserved, &rdev2->flags) ||
2389                                     (rdev->bdev == rdev2->bdev &&
2390                                      rdev != rdev2 &&
2391                                      overlaps(rdev->data_offset, rdev->sectors,
2392                                               rdev2->data_offset,
2393                                               rdev2->sectors))) {
2394                                         overlap = 1;
2395                                         break;
2396                                 }
2397                         mddev_unlock(mddev);
2398                         if (overlap) {
2399                                 mddev_put(mddev);
2400                                 break;
2401                         }
2402                 }
2403                 mddev_lock(my_mddev);
2404                 if (overlap) {
2405                         /* Someone else could have slipped in a size
2406                          * change here, but doing so is just silly.
2407                          * We put oldsectors back because we *know* it is
2408                          * safe, and trust userspace not to race with
2409                          * itself
2410                          */
2411                         rdev->sectors = oldsectors;
2412                         return -EBUSY;
2413                 }
2414         }
2415         return len;
2416 }
2417
2418 static struct rdev_sysfs_entry rdev_size =
2419 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2420
2421 static struct attribute *rdev_default_attrs[] = {
2422         &rdev_state.attr,
2423         &rdev_errors.attr,
2424         &rdev_slot.attr,
2425         &rdev_offset.attr,
2426         &rdev_size.attr,
2427         NULL,
2428 };
2429 static ssize_t
2430 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2431 {
2432         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2433         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2434         mddev_t *mddev = rdev->mddev;
2435         ssize_t rv;
2436
2437         if (!entry->show)
2438                 return -EIO;
2439
2440         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2441         if (!rv) {
2442                 if (rdev->mddev == NULL)
2443                         rv = -EBUSY;
2444                 else
2445                         rv = entry->show(rdev, page);
2446                 mddev_unlock(mddev);
2447         }
2448         return rv;
2449 }
2450
2451 static ssize_t
2452 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2453               const char *page, size_t length)
2454 {
2455         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2456         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2457         ssize_t rv;
2458         mddev_t *mddev = rdev->mddev;
2459
2460         if (!entry->store)
2461                 return -EIO;
2462         if (!capable(CAP_SYS_ADMIN))
2463                 return -EACCES;
2464         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2465         if (!rv) {
2466                 if (rdev->mddev == NULL)
2467                         rv = -EBUSY;
2468                 else
2469                         rv = entry->store(rdev, page, length);
2470                 mddev_unlock(mddev);
2471         }
2472         return rv;
2473 }
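/*
 * Both accessors above take the owning mddev's lock and then re-check
 * rdev->mddev, so an access racing with unbind_rdev_from_array() fails
 * with -EBUSY rather than touching a half-removed rdev.
 */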
2474
2475 static void rdev_free(struct kobject *ko)
2476 {
2477         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2478         kfree(rdev);
2479 }
2480 static struct sysfs_ops rdev_sysfs_ops = {
2481         .show           = rdev_attr_show,
2482         .store          = rdev_attr_store,
2483 };
2484 static struct kobj_type rdev_ktype = {
2485         .release        = rdev_free,
2486         .sysfs_ops      = &rdev_sysfs_ops,
2487         .default_attrs  = rdev_default_attrs,
2488 };
2489
2490 /*
2491  * Import a device. If 'super_format' >= 0, then sanity check the superblock
2492  *
2493  * mark the device faulty if:
2494  *
2495  *   - the device is nonexistent (zero size)
2496  *   - the device has no valid superblock
2497  *
2498  * a faulty rdev _never_ has rdev->sb set.
2499  */
2500 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2501 {
2502         char b[BDEVNAME_SIZE];
2503         int err;
2504         mdk_rdev_t *rdev;
2505         sector_t size;
2506
2507         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2508         if (!rdev) {
2509                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2510                 return ERR_PTR(-ENOMEM);
2511         }
2512
2513         if ((err = alloc_disk_sb(rdev)))
2514                 goto abort_free;
2515
2516         err = lock_rdev(rdev, newdev, super_format == -2);
2517         if (err)
2518                 goto abort_free;
2519
2520         kobject_init(&rdev->kobj, &rdev_ktype);
2521
2522         rdev->desc_nr = -1;
2523         rdev->saved_raid_disk = -1;
2524         rdev->raid_disk = -1;
2525         rdev->flags = 0;
2526         rdev->data_offset = 0;
2527         rdev->sb_events = 0;
2528         atomic_set(&rdev->nr_pending, 0);
2529         atomic_set(&rdev->read_errors, 0);
2530         atomic_set(&rdev->corrected_errors, 0);
2531
2532         size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2533         if (!size) {
2534                 printk(KERN_WARNING 
2535                         "md: %s has zero or unknown size, marking faulty!\n",
2536                         bdevname(rdev->bdev,b));
2537                 err = -EINVAL;
2538                 goto abort_free;
2539         }
2540
2541         if (super_format >= 0) {
2542                 err = super_types[super_format].
2543                         load_super(rdev, NULL, super_minor);
2544                 if (err == -EINVAL) {
2545                         printk(KERN_WARNING
2546                                 "md: %s does not have a valid v%d.%d "
2547                                "superblock, not importing!\n",
2548                                 bdevname(rdev->bdev,b),
2549                                super_format, super_minor);
2550                         goto abort_free;
2551                 }
2552                 if (err < 0) {
2553                         printk(KERN_WARNING 
2554                                 "md: could not read %s's sb, not importing!\n",
2555                                 bdevname(rdev->bdev,b));
2556                         goto abort_free;
2557                 }
2558         }
2559
2560         INIT_LIST_HEAD(&rdev->same_set);
2561         init_waitqueue_head(&rdev->blocked_wait);
2562
2563         return rdev;
2564
2565 abort_free:
2566         if (rdev->sb_page) {
2567                 if (rdev->bdev)
2568                         unlock_rdev(rdev);
2569                 free_disk_sb(rdev);
2570         }
2571         kfree(rdev);
2572         return ERR_PTR(err);
2573 }
2574
2575 /*
2576  * Check a full RAID array for plausibility
2577  */
2578
2579
2580 static void analyze_sbs(mddev_t * mddev)
2581 {
2582         int i;
2583         mdk_rdev_t *rdev, *freshest, *tmp;
2584         char b[BDEVNAME_SIZE];
2585
2586         freshest = NULL;
2587         rdev_for_each(rdev, tmp, mddev)
2588                 switch (super_types[mddev->major_version].
2589                         load_super(rdev, freshest, mddev->minor_version)) {
2590                 case 1:
2591                         freshest = rdev;
2592                         break;
2593                 case 0:
2594                         break;
2595                 default:
2596                         printk(KERN_ERR
2597                                 "md: fatal superblock inconsistency in %s"
2598                                 " -- removing from array\n",
2599                                 bdevname(rdev->bdev,b));
2600                         kick_rdev_from_array(rdev);
2601                 }
2602
2603
2604         super_types[mddev->major_version].
2605                 validate_super(mddev, freshest);
2606
2607         i = 0;
2608         rdev_for_each(rdev, tmp, mddev) {
2609                 if (rdev->desc_nr >= mddev->max_disks ||
2610                     i > mddev->max_disks) {
2611                         printk(KERN_WARNING
2612                                "md: %s: %s: only %d devices permitted\n",
2613                                mdname(mddev), bdevname(rdev->bdev, b),
2614                                mddev->max_disks);
2615                         kick_rdev_from_array(rdev);
2616                         continue;
2617                 }
2618                 if (rdev != freshest)
2619                         if (super_types[mddev->major_version].
2620                             validate_super(mddev, rdev)) {
2621                                 printk(KERN_WARNING "md: kicking non-fresh %s"
2622                                         " from array!\n",
2623                                         bdevname(rdev->bdev,b));
2624                                 kick_rdev_from_array(rdev);
2625                                 continue;
2626                         }
2627                 if (mddev->level == LEVEL_MULTIPATH) {
2628                         rdev->desc_nr = i++;
2629                         rdev->raid_disk = rdev->desc_nr;
2630                         set_bit(In_sync, &rdev->flags);
2631                 } else if (rdev->raid_disk >= mddev->raid_disks) {
2632                         rdev->raid_disk = -1;
2633                         clear_bit(In_sync, &rdev->flags);
2634                 }
2635         }
2636 }
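/*
 * Note: load_super() returns 1 when the rdev's superblock is newer than
 * the current reference, so after the first loop 'freshest' points at the
 * device with the highest event count, and everything else is validated
 * (or kicked) against it.
 */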
2637
2638 static void md_safemode_timeout(unsigned long data);
2639
2640 static ssize_t
2641 safe_delay_show(mddev_t *mddev, char *page)
2642 {
2643         int msec = (mddev->safemode_delay*1000)/HZ;
2644         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2645 }
2646 static ssize_t
2647 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2648 {
2649         int scale = 1;
2650         int dot = 0;
2651         int i;
2652         unsigned long msec;
2653         char buf[30];
2654
2655         /* remove a period, and count digits after it */
2656         if (len >= sizeof(buf))
2657                 return -EINVAL;
2658         strlcpy(buf, cbuf, sizeof(buf));
2659         for (i = 0; i < len; i++) {
2660                 if (dot) {
2661                         if (isdigit(buf[i])) {
2662                                 buf[i-1] = buf[i];
2663                                 scale *= 10;
2664                         }
2665                         buf[i] = 0;
2666                 } else if (buf[i] == '.') {
2667                         dot = 1;
2668                         buf[i] = 0;
2669                 }
2670         }
2671         if (strict_strtoul(buf, 10, &msec) < 0)
2672                 return -EINVAL;
2673         msec = (msec * 1000) / scale;
2674         if (msec == 0)
2675                 mddev->safemode_delay = 0;
2676         else {
2677                 unsigned long old_delay = mddev->safemode_delay;
2678                 mddev->safemode_delay = (msec*HZ)/1000;
2679                 if (mddev->safemode_delay == 0)
2680                         mddev->safemode_delay = 1;
2681                 if (mddev->safemode_delay < old_delay)
2682                         md_safemode_timeout((unsigned long)mddev);
2683         }
2684         return len;
2685 }
2686 static struct md_sysfs_entry md_safe_delay =
2687 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
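/*
 * Accepts values like "0.200" (200 milliseconds) or "3" (three seconds);
 * any nonzero delay is clamped to at least one jiffy.
 */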
2688
2689 static ssize_t
2690 level_show(mddev_t *mddev, char *page)
2691 {
2692         struct mdk_personality *p = mddev->pers;
2693         if (p)
2694                 return sprintf(page, "%s\n", p->name);
2695         else if (mddev->clevel[0])
2696                 return sprintf(page, "%s\n", mddev->clevel);
2697         else if (mddev->level != LEVEL_NONE)
2698                 return sprintf(page, "%d\n", mddev->level);
2699         else
2700                 return 0;
2701 }
2702
2703 static ssize_t
2704 level_store(mddev_t *mddev, const char *buf, size_t len)
2705 {
2706         char level[16];
2707         ssize_t rv = len;
2708         struct mdk_personality *pers;
2709         void *priv;
2710         mdk_rdev_t *rdev;
2711
2712         if (mddev->pers == NULL) {
2713                 if (len == 0)
2714                         return 0;
2715                 if (len >= sizeof(mddev->clevel))
2716                         return -ENOSPC;
2717                 strncpy(mddev->clevel, buf, len);
2718                 if (mddev->clevel[len-1] == '\n')
2719                         len--;
2720                 mddev->clevel[len] = 0;
2721                 mddev->level = LEVEL_NONE;
2722                 return rv;
2723         }
2724
2725         /* request to change the personality.  Need to ensure:
2726          *  - array is not engaged in resync/recovery/reshape
2727          *  - old personality can be suspended
2728          *  - new personality will access other array.
2729          *  - new personality can take over the array.
2730
2731         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2732                 return -EBUSY;
2733
2734         if (!mddev->pers->quiesce) {
2735                 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2736                        mdname(mddev), mddev->pers->name);
2737                 return -EINVAL;
2738         }
2739
2740         /* Now find the new personality */
2741         if (len == 0 || len >= sizeof(level))
2742                 return -EINVAL;
2743         strncpy(level, buf, len);
2744         if (level[len-1] == '\n')
2745                 len--;
2746         level[len] = 0;
2747
2748         request_module("md-%s", level);
2749         spin_lock(&pers_lock);
2750         pers = find_pers(LEVEL_NONE, level);
2751         if (!pers || !try_module_get(pers->owner)) {
2752                 spin_unlock(&pers_lock);
2753                 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2754                 return -EINVAL;
2755         }
2756         spin_unlock(&pers_lock);
2757
2758         if (pers == mddev->pers) {
2759                 /* Nothing to do! */
2760                 module_put(pers->owner);
2761                 return rv;
2762         }
2763         if (!pers->takeover) {
2764                 module_put(pers->owner);
2765                 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2766                        mdname(mddev), level);
2767                 return -EINVAL;
2768         }
2769
2770         /* ->takeover must set new_* and/or delta_disks
2771          * if it succeeds, and may set them when it fails.
2772          */
2773         priv = pers->takeover(mddev);
2774         if (IS_ERR(priv)) {
2775                 mddev->new_level = mddev->level;
2776                 mddev->new_layout = mddev->layout;
2777                 mddev->new_chunk_sectors = mddev->chunk_sectors;
2778                 mddev->raid_disks -= mddev->delta_disks;
2779                 mddev->delta_disks = 0;
2780                 module_put(pers->owner);
2781                 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2782                        mdname(mddev), level);
2783                 return PTR_ERR(priv);
2784         }
2785
2786         /* Looks like we have a winner */
2787         mddev_suspend(mddev);
2788         mddev->pers->stop(mddev);
2789         module_put(mddev->pers->owner);
2790         /* Invalidate devices that are now superfluous */
2791         list_for_each_entry(rdev, &mddev->disks, same_set)
2792                 if (rdev->raid_disk >= mddev->raid_disks) {
2793                         rdev->raid_disk = -1;
2794                         clear_bit(In_sync, &rdev->flags);
2795                 }
2796         mddev->pers = pers;
2797         mddev->private = priv;
2798         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2799         mddev->level = mddev->new_level;
2800         mddev->layout = mddev->new_layout;
2801         mddev->chunk_sectors = mddev->new_chunk_sectors;
2802         mddev->delta_disks = 0;
2803         pers->run(mddev);
2804         mddev_resume(mddev);
2805         set_bit(MD_CHANGE_DEVS, &mddev->flags);
2806         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2807         md_wakeup_thread(mddev->thread);
2808         return rv;
2809 }
2810
2811 static struct md_sysfs_entry md_level =
2812 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
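/*
 * A successful takeover above swaps personalities under mddev_suspend():
 * the old personality is stopped, rdevs beyond the new raid_disks are
 * invalidated, and the geometry staged in new_* by ->takeover() becomes
 * current before the array is resumed.
 */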
2813
2814
2815 static ssize_t
2816 layout_show(mddev_t *mddev, char *page)
2817 {
2818         /* just a number, not meaningful for all levels */
2819         if (mddev->reshape_position != MaxSector &&
2820             mddev->layout != mddev->new_layout)
2821                 return sprintf(page, "%d (%d)\n",
2822                                mddev->new_layout, mddev->layout);
2823         return sprintf(page, "%d\n", mddev->layout);
2824 }
2825
2826 static ssize_t
2827 layout_store(mddev_t *mddev, const char *buf, size_t len)
2828 {
2829         char *e;
2830         unsigned long n = simple_strtoul(buf, &e, 10);
2831
2832         if (!*buf || (*e && *e != '\n'))
2833                 return -EINVAL;
2834
2835         if (mddev->pers) {
2836                 int err;
2837                 if (mddev->pers->check_reshape == NULL)
2838                         return -EBUSY;
2839                 mddev->new_layout = n;
2840                 err = mddev->pers->check_reshape(mddev);
2841                 if (err) {
2842                         mddev->new_layout = mddev->layout;
2843                         return err;
2844                 }
2845         } else {
2846                 mddev->new_layout = n;
2847                 if (mddev->reshape_position == MaxSector)
2848                         mddev->layout = n;
2849         }
2850         return len;
2851 }
2852 static struct md_sysfs_entry md_layout =
2853 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2854
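/*
 * Sketch of expected input (illustrative device name): a bare integer
 * whose meaning depends on the personality, e.g. a RAID5
 * parity-placement algorithm:
 *
 *   # echo 2 > /sys/block/md0/md/layout
 *
 * On a running array the new value only sticks if the personality's
 * ->check_reshape() accepts it, as implemented above.
 */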
2855
2856 static ssize_t
2857 raid_disks_show(mddev_t *mddev, char *page)
2858 {
2859         if (mddev->raid_disks == 0)
2860                 return 0;
2861         if (mddev->reshape_position != MaxSector &&
2862             mddev->delta_disks != 0)
2863                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2864                                mddev->raid_disks - mddev->delta_disks);
2865         return sprintf(page, "%d\n", mddev->raid_disks);
2866 }
2867
2868 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2869
2870 static ssize_t
2871 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2872 {
2873         char *e;
2874         int rv = 0;
2875         unsigned long n = simple_strtoul(buf, &e, 10);
2876
2877         if (!*buf || (*e && *e != '\n'))
2878                 return -EINVAL;
2879
2880         if (mddev->pers)
2881                 rv = update_raid_disks(mddev, n);
2882         else if (mddev->reshape_position != MaxSector) {
2883                 int olddisks = mddev->raid_disks - mddev->delta_disks;
2884                 mddev->delta_disks = n - olddisks;
2885                 mddev->raid_disks = n;
2886         } else
2887                 mddev->raid_disks = n;
2888         return rv ? rv : len;
2889 }
2890 static struct md_sysfs_entry md_raid_disks =
2891 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2892
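/*
 * Sketch (illustrative device name): on a running array this goes
 * through update_raid_disks() and may therefore start a reshape; on an
 * inactive array it simply records the device count:
 *
 *   # echo 4 > /sys/block/md0/md/raid_disks
 */
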
2893 static ssize_t
2894 chunk_size_show(mddev_t *mddev, char *page)
2895 {
2896         if (mddev->reshape_position != MaxSector &&
2897             mddev->chunk_sectors != mddev->new_chunk_sectors)
2898                 return sprintf(page, "%d (%d)\n",
2899                                mddev->new_chunk_sectors << 9,
2900                                mddev->chunk_sectors << 9);
2901         return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2902 }
2903
2904 static ssize_t
2905 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2906 {
2907         char *e;
2908         unsigned long n = simple_strtoul(buf, &e, 10);
2909
2910         if (!*buf || (*e && *e != '\n'))
2911                 return -EINVAL;
2912
2913         if (mddev->pers) {
2914                 int err;
2915                 if (mddev->pers->check_reshape == NULL)
2916                         return -EBUSY;
2917                 mddev->new_chunk_sectors = n >> 9;
2918                 err = mddev->pers->check_reshape(mddev);
2919                 if (err) {
2920                         mddev->new_chunk_sectors = mddev->chunk_sectors;
2921                         return err;
2922                 }
2923         } else {
2924                 mddev->new_chunk_sectors = n >> 9;
2925                 if (mddev->reshape_position == MaxSector)
2926                         mddev->chunk_sectors = n >> 9;
2927         }
2928         return len;
2929 }
2930 static struct md_sysfs_entry md_chunk_size =
2931 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2932
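/*
 * Note the unit conversion above: this file is read and written in
 * bytes, while mddev->chunk_sectors holds 512-byte sectors, hence the
 * "<< 9" / ">> 9".  Illustrative example: writing 65536 (64KiB)
 * stores chunk_sectors = 128.
 *
 *   # echo 65536 > /sys/block/md0/md/chunk_size
 */
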
2933 static ssize_t
2934 resync_start_show(mddev_t *mddev, char *page)
2935 {
2936         if (mddev->recovery_cp == MaxSector)
2937                 return sprintf(page, "none\n");
2938         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2939 }
2940
2941 static ssize_t
2942 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2943 {
2944         char *e;
2945         unsigned long long n = simple_strtoull(buf, &e, 10);
2946
2947         if (mddev->pers)
2948                 return -EBUSY;
2949         if (!*buf || (*e && *e != '\n'))
2950                 return -EINVAL;
2951
2952         mddev->recovery_cp = n;
2953         return len;
2954 }
2955 static struct md_sysfs_entry md_resync_start =
2956 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2957
2958 /*
2959  * The array state can be:
2960  *
2961  * clear
2962  *     No devices, no size, no level
2963  *     Equivalent to STOP_ARRAY ioctl
2964  * inactive
2965  *     May have some settings, but array is not active
2966  *        all IO results in error
2967  *     When written, doesn't tear down array, but just stops it
2968  * suspended (not supported yet)
2969  *     All IO requests will block. The array can be reconfigured.
2970  *     Writing this, if accepted, will block until array is quiescent
2971  * readonly
2972  *     no resync can happen.  no superblocks get written.
2973  *     write requests fail
2974  * read-auto
2975  *     like readonly, but behaves like 'clean' on a write request.
2976  *
2977  * clean - no pending writes, but otherwise active.
2978  *     When written to inactive array, starts without resync
2979  *     If a write request arrives then
2980  *       if metadata is known, mark 'dirty' and switch to 'active'.
2981  *       if not known, block and switch to write-pending
2982  *     If written to an active array that has pending writes, then fails.
2983  * active
2984  *     fully active: IO and resync can be happening.
2985  *     When written to inactive array, starts with resync
2986  *
2987  * write-pending
2988  *     clean, but writes are blocked waiting for 'active' to be written.
2989  *
2990  * active-idle
2991  *     like active, but no writes have been seen for a while (100msec).
2992  *
2993  */
2994 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2995                    write_pending, active_idle, bad_word};
2996 static char *array_states[] = {
2997         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
2998         "write-pending", "active-idle", NULL };
2999
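/* Note: array_states[] must stay in step with enum array_state above;
 * array_state_show() indexes it directly with the enum value.
 */
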
3000 static int match_word(const char *word, char **list)
3001 {
3002         int n;
3003         for (n=0; list[n]; n++)
3004                 if (cmd_match(word, list[n]))
3005                         break;
3006         return n;
3007 }
3008
3009 static ssize_t
3010 array_state_show(mddev_t *mddev, char *page)
3011 {
3012         enum array_state st = inactive;
3013
3014         if (mddev->pers)
3015                 switch(mddev->ro) {
3016                 case 1:
3017                         st = readonly;
3018                         break;
3019                 case 2:
3020                         st = read_auto;
3021                         break;
3022                 case 0:
3023                         if (mddev->in_sync)
3024                                 st = clean;
3025                         else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3026                                 st = write_pending;
3027                         else if (mddev->safemode)
3028                                 st = active_idle;
3029                         else
3030                                 st = active;
3031                 }
3032         else {
3033                 if (list_empty(&mddev->disks) &&
3034                     mddev->raid_disks == 0 &&
3035                     mddev->dev_sectors == 0)
3036                         st = clear;
3037                 else
3038                         st = inactive;
3039         }
3040         return sprintf(page, "%s\n", array_states[st]);
3041 }
3042
3043 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3044 static int do_md_run(mddev_t * mddev);
3045 static int restart_array(mddev_t *mddev);
3046
3047 static ssize_t
3048 array_state_store(mddev_t *mddev, const char *buf, size_t len)
3049 {
3050         int err = -EINVAL;
3051         enum array_state st = match_word(buf, array_states);
3052         switch(st) {
3053         case bad_word:
3054                 break;
3055         case clear:
3056                 /* stopping an active array */
3057                 if (atomic_read(&mddev->openers) > 0)
3058                         return -EBUSY;
3059                 err = do_md_stop(mddev, 0, 0);
3060                 break;
3061         case inactive:
3062                 /* stopping an active array */
3063                 if (mddev->pers) {
3064                         if (atomic_read(&mddev->openers) > 0)
3065                                 return -EBUSY;
3066                         err = do_md_stop(mddev, 2, 0);
3067                 } else
3068                         err = 0; /* already inactive */
3069                 break;
3070         case suspended:
3071                 break; /* not supported yet */
3072         case readonly:
3073                 if (mddev->pers)
3074                         err = do_md_stop(mddev, 1, 0);
3075                 else {
3076                         mddev->ro = 1;
3077                         set_disk_ro(mddev->gendisk, 1);
3078                         err = do_md_run(mddev);
3079                 }
3080                 break;
3081         case read_auto:
3082                 if (mddev->pers) {
3083                         if (mddev->ro == 0)
3084                                 err = do_md_stop(mddev, 1, 0);
3085                         else if (mddev->ro == 1)
3086                                 err = restart_array(mddev);
3087                         if (err == 0) {
3088                                 mddev->ro = 2;
3089                                 set_disk_ro(mddev->gendisk, 0);
3090                         }
3091                 } else {
3092                         mddev->ro = 2;
3093                         err = do_md_run(mddev);
3094                 }
3095                 break;
3096         case clean:
3097                 if (mddev->pers) {
3098                         restart_array(mddev);
3099                         spin_lock_irq(&mddev->write_lock);
3100                         if (atomic_read(&mddev->writes_pending) == 0) {
3101                                 if (mddev->in_sync == 0) {
3102                                         mddev->in_sync = 1;
3103                                         if (mddev->safemode == 1)
3104                                                 mddev->safemode = 0;
3105                                         if (mddev->persistent)
3106                                                 set_bit(MD_CHANGE_CLEAN,
3107                                                         &mddev->flags);
3108                                 }
3109                                 err = 0;
3110                         } else
3111                                 err = -EBUSY;
3112                         spin_unlock_irq(&mddev->write_lock);
3113                 } else
3114                         err = -EINVAL;
3115                 break;
3116         case active:
3117                 if (mddev->pers) {
3118                         restart_array(mddev);
3119                         if (mddev->external)
3120                                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3121                         wake_up(&mddev->sb_wait);
3122                         err = 0;
3123                 } else {
3124                         mddev->ro = 0;
3125                         set_disk_ro(mddev->gendisk, 0);
3126                         err = do_md_run(mddev);
3127                 }
3128                 break;
3129         case write_pending:
3130         case active_idle:
3131                 /* these cannot be set */
3132                 break;
3133         }
3134         if (err)
3135                 return err;
3136         else {
3137                 sysfs_notify_dirent(mddev->sysfs_state);
3138                 return len;
3139         }
3140 }
3141 static struct md_sysfs_entry md_array_state =
3142 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3143
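/*
 * Usage sketch (illustrative device name): userspace such as mdadm can
 * drive state transitions directly, e.g.
 *
 *   # echo readonly > /sys/block/md0/md/array_state
 *   # echo active   > /sys/block/md0/md/array_state
 *
 * Writes that would stop an array still open elsewhere fail with
 * -EBUSY, per the mddev->openers checks above.
 */
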
3144 static ssize_t
3145 null_show(mddev_t *mddev, char *page)
3146 {
3147         return -EINVAL;
3148 }
3149
3150 static ssize_t
3151 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3152 {
3153         /* buf must be "%d:%d" (major:minor), optionally followed by '\n' */
3154         /* The new device is added to the array.
3155          * If the array has a persistent superblock, we read the
3156          * superblock to initialise info and check validity.
3157          * Otherwise, only checking done is that in bind_rdev_to_array,
3158          * which mainly checks size.
3159          */
3160         char *e;
3161         int major = simple_strtoul(buf, &e, 10);
3162         int minor;
3163         dev_t dev;
3164         mdk_rdev_t *rdev;
3165         int err;
3166
3167         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3168                 return -EINVAL;
3169         minor = simple_strtoul(e+1, &e, 10);
3170         if (*e && *e != '\n')
3171                 return -EINVAL;
3172         dev = MKDEV(major, minor);
3173         if (major != MAJOR(dev) ||
3174             minor != MINOR(dev))
3175                 return -EOVERFLOW;
3176
3177
3178         if (mddev->persistent) {
3179                 rdev = md_import_device(dev, mddev->major_version,
3180                                         mddev->minor_version);
3181                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3182                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3183                                                        mdk_rdev_t, same_set);
3184                         err = super_types[mddev->major_version]
3185                                 .load_super(rdev, rdev0, mddev->minor_version);
3186                         if (err < 0)
3187                                 goto out;
3188                 }
3189         } else if (mddev->external)
3190                 rdev = md_import_device(dev, -2, -1);
3191         else
3192                 rdev = md_import_device(dev, -1, -1);
3193
3194         if (IS_ERR(rdev))
3195                 return PTR_ERR(rdev);
3196         err = bind_rdev_to_array(rdev, mddev);
3197  out:
3198         if (err)
3199                 export_rdev(rdev);
3200         return err ? err : len;
3201 }
3202
3203 static struct md_sysfs_entry md_new_device =
3204 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3205
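/*
 * Sketch of the expected input: a "major:minor" pair naming the
 * component device, e.g. (8:16 is conventionally /dev/sdb; the mapping
 * here is illustrative):
 *
 *   # echo 8:16 > /sys/block/md0/md/new_dev
 */
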
3206 static ssize_t
3207 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3208 {
3209         char *end;
3210         unsigned long chunk, end_chunk;
3211
3212         if (!mddev->bitmap)
3213                 goto out;
3214         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3215         while (*buf) {
3216                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3217                 if (buf == end) break;
3218                 if (*end == '-') { /* range */
3219                         buf = end + 1;
3220                         end_chunk = simple_strtoul(buf, &end, 0);
3221                         if (buf == end) break;
3222                 }
3223                 if (*end && !isspace(*end)) break;
3224                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3225                 buf = end;
3226                 while (isspace(*buf)) buf++;
3227         }
3228         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3229 out:
3230         return len;
3231 }
3232
3233 static struct md_sysfs_entry md_bitmap =
3234 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3235
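/*
 * Sketch of the input format parsed above: whitespace-separated chunk
 * numbers and/or first-last ranges, e.g. marking chunks 100-200 and
 * 300 dirty so they get resynced:
 *
 *   # echo "100-200 300" > /sys/block/md0/md/bitmap_set_bits
 */
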
3236 static ssize_t
3237 size_show(mddev_t *mddev, char *page)
3238 {
3239         return sprintf(page, "%llu\n",
3240                 (unsigned long long)mddev->dev_sectors / 2);
3241 }
3242
3243 static int update_size(mddev_t *mddev, sector_t num_sectors);
3244
3245 static ssize_t
3246 size_store(mddev_t *mddev, const char *buf, size_t len)
3247 {
3248         /* If array is inactive, we can reduce the component size, but
3249          * not increase it (except from 0).
3250          * If array is active, we can try an on-line resize
3251          */
3252         sector_t sectors;
3253         int err = strict_blocks_to_sectors(buf, &sectors);
3254
3255         if (err < 0)
3256                 return err;
3257         if (mddev->pers) {
3258                 err = update_size(mddev, sectors);
3259                 md_update_sb(mddev, 1);
3260         } else {
3261                 if (mddev->dev_sectors == 0 ||
3262                     mddev->dev_sectors > sectors)
3263                         mddev->dev_sectors = sectors;
3264                 else
3265                         err = -ENOSPC;
3266         }
3267         return err ? err : len;
3268 }
3269
3270 static struct md_sysfs_entry md_size =
3271 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3272
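/* As with chunk_size, note the units: this file is in KiB (1K blocks),
 * while mddev->dev_sectors counts 512-byte sectors - hence the "/ 2"
 * above and strict_blocks_to_sectors() in the store path.
 */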
3273
3274 /* Metadata version.
3275  * This is one of
3276  *   'none' for arrays with no metadata (good luck...)
3277  *   'external' for arrays with externally managed metadata,
3278  * or N.M for internally known formats
3279  */
3280 static ssize_t
3281 metadata_show(mddev_t *mddev, char *page)
3282 {
3283         if (mddev->persistent)
3284                 return sprintf(page, "%d.%d\n",
3285                                mddev->major_version, mddev->minor_version);
3286         else if (mddev->external)
3287                 return sprintf(page, "external:%s\n", mddev->metadata_type);
3288         else
3289                 return sprintf(page, "none\n");
3290 }
3291
3292 static ssize_t
3293 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3294 {
3295         int major, minor;
3296         char *e;
3297         /* Changing the details of 'external' metadata is
3298          * always permitted.  Otherwise there must be
3299          * no devices attached to the array.
3300          */
3301         if (mddev->external && strncmp(buf, "external:", 9) == 0)
3302                 ;
3303         else if (!list_empty(&mddev->disks))
3304                 return -EBUSY;
3305
3306         if (cmd_match(buf, "none")) {
3307                 mddev->persistent = 0;
3308                 mddev->external = 0;
3309                 mddev->major_version = 0;
3310                 mddev->minor_version = 90;
3311                 return len;
3312         }
3313         if (strncmp(buf, "external:", 9) == 0) {
3314                 size_t namelen = len-9;
3315                 if (namelen >= sizeof(mddev->metadata_type))
3316                         namelen = sizeof(mddev->metadata_type)-1;
3317                 strncpy(mddev->metadata_type, buf+9, namelen);
3318                 mddev->metadata_type[namelen] = 0;
3319                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3320                         mddev->metadata_type[--namelen] = 0;
3321                 mddev->persistent = 0;
3322                 mddev->external = 1;
3323                 mddev->major_version = 0;
3324                 mddev->minor_version = 90;
3325                 return len;
3326         }
3327         major = simple_strtoul(buf, &e, 10);
3328         if (e==buf || *e != '.')
3329                 return -EINVAL;
3330         buf = e+1;
3331         minor = simple_strtoul(buf, &e, 10);
3332         if (e==buf || (*e && *e != '\n') )
3333                 return -EINVAL;
3334         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3335                 return -ENOENT;
3336         mddev->major_version = major;
3337         mddev->minor_version = minor;
3338         mddev->persistent = 1;
3339         mddev->external = 0;
3340         return len;
3341 }
3342
3343 static struct md_sysfs_entry md_metadata =
3344 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3345
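/*
 * Usage sketch (the metadata names are illustrative): internal formats
 * are written as "major.minor", external ones with an "external:"
 * prefix, e.g.
 *
 *   # echo 1.2 > /sys/block/md0/md/metadata_version
 *   # echo external:imsm > /sys/block/md0/md/metadata_version
 */
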
3346 static ssize_t
3347 action_show(mddev_t *mddev, char *page)
3348 {
3349         char *type = "idle";
3350         if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3351                 type = "frozen";
3352         else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3353             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3354                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3355                         type = "reshape";
3356                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3357                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3358                                 type = "resync";
3359                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3360                                 type = "check";
3361                         else
3362                                 type = "repair";
3363                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3364                         type = "recover";
3365         }
3366         return sprintf(page, "%s\n", type);
3367 }
3368
3369 static ssize_t
3370 action_store(mddev_t *mddev, const char *page, size_t len)
3371 {
3372         if (!mddev->pers || !mddev->pers->sync_request)
3373                 return -EINVAL;
3374
3375         if (cmd_match(page, "frozen"))
3376                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3377         else
3378                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3379
3380         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3381                 if (mddev->sync_thread) {
3382                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3383                         md_unregister_thread(mddev->sync_thread);
3384                         mddev->sync_thread = NULL;
3385                         mddev->recovery = 0;
3386                 }
3387         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3388                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3389                 return -EBUSY;
3390         else if (cmd_match(page, "resync"))
3391                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3392         else if (cmd_match(page, "recover")) {
3393                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3394                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3395         } else if (cmd_match(page, "reshape")) {
3396                 int err;
3397                 if (mddev->pers->start_reshape == NULL)
3398                         return -EINVAL;
3399                 err = mddev->pers->start_reshape(mddev);
3400                 if (err)
3401                         return err;
3402                 sysfs_notify(&mddev->kobj, NULL, "degraded");
3403         } else {
3404                 if (cmd_match(page, "check"))
3405                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3406                 else if (!cmd_match(page, "repair"))
3407                         return -EINVAL;
3408                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3409                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3410         }
3411         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3412         md_wakeup_thread(mddev->thread);
3413         sysfs_notify_dirent(mddev->sysfs_action);
3414         return len;
3415 }
3416
3417 static ssize_t
3418 mismatch_cnt_show(mddev_t *mddev, char *page)
3419 {
3420         return sprintf(page, "%llu\n",
3421                        (unsigned long long) mddev->resync_mismatches);
3422 }
3423
3424 static struct md_sysfs_entry md_scan_mode =
3425 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3426
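/*
 * Usage sketch (illustrative device name) for the actions parsed by
 * action_store() above:
 *
 *   # echo check  > /sys/block/md0/md/sync_action
 *   # echo repair > /sys/block/md0/md/sync_action
 *   # echo idle   > /sys/block/md0/md/sync_action    (abort a scan)
 */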
3427
3428 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3429
3430 static ssize_t
3431 sync_min_show(mddev_t *mddev, char *page)
3432 {
3433         return sprintf(page, "%d (%s)\n", speed_min(mddev),
3434                        mddev->sync_speed_min ? "local": "system");
3435 }
3436
3437 static ssize_t
3438 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3439 {
3440         int min;
3441         char *e;
3442         if (strncmp(buf, "system", 6)==0) {
3443                 mddev->sync_speed_min = 0;
3444                 return len;
3445         }
3446         min = simple_strtoul(buf, &e, 10);
3447         if (buf == e || (*e && *e != '\n') || min <= 0)
3448                 return -EINVAL;
3449         mddev->sync_speed_min = min;
3450         return len;
3451 }
3452
3453 static struct md_sysfs_entry md_sync_min =
3454 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3455
3456 static ssize_t
3457 sync_max_show(mddev_t *mddev, char *page)
3458 {
3459         return sprintf(page, "%d (%s)\n", speed_max(mddev),
3460                        mddev->sync_speed_max ? "local": "system");
3461 }
3462
3463 static ssize_t
3464 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3465 {
3466         int max;
3467         char *e;
3468         if (strncmp(buf, "system", 6)==0) {
3469                 mddev->sync_speed_max = 0;
3470                 return len;
3471         }
3472         max = simple_strtoul(buf, &e, 10);
3473         if (buf == e || (*e && *e != '\n') || max <= 0)
3474                 return -EINVAL;
3475         mddev->sync_speed_max = max;
3476         return len;
3477 }
3478
3479 static struct md_sysfs_entry md_sync_max =
3480 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3481
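/* Sketch: sync_speed_min and sync_speed_max each accept a KB/sec
 * figure, or the literal string "system" to fall back to the
 * system-wide default, e.g.
 *
 *   # echo 50000  > /sys/block/md0/md/sync_speed_max
 *   # echo system > /sys/block/md0/md/sync_speed_max
 */
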
3482 static ssize_t
3483 degraded_show(mddev_t *mddev, char *page)
3484 {
3485         return sprintf(page, "%d\n", mddev->degraded);
3486 }
3487 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3488
3489 static ssize_t
3490 sync_force_parallel_show(mddev_t *mddev, char *page)
3491 {
3492         return sprintf(page, "%d\n", mddev->parallel_resync);
3493 }
3494
3495 static ssize_t
3496 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3497 {
3498         long n;
3499
3500         if (strict_strtol(buf, 10, &n))
3501                 return -EINVAL;
3502
3503         if (n != 0 && n != 1)
3504                 return -EINVAL;
3505
3506         mddev->parallel_resync = n;
3507
3508         if (mddev->sync_thread)
3509                 wake_up(&resync_wait);
3510
3511         return len;
3512 }
3513
3514 /* force parallel resync, even with shared block devices */
3515 static struct md_sysfs_entry md_sync_force_parallel =
3516 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3517        sync_force_parallel_show, sync_force_parallel_store);
3518
3519 static ssize_t
3520 sync_speed_show(mddev_t *mddev, char *page)
3521 {
3522         unsigned long resync, dt, db;
3523         if (mddev->curr_resync == 0)
3524                 return sprintf(page, "none\n");
3525         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3526         dt = (jiffies - mddev->resync_mark) / HZ;
3527         if (!dt) dt++;
3528         db = resync - mddev->resync_mark_cnt;
3529         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3530 }
3531
3532 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3533
3534 static ssize_t
3535 sync_completed_show(mddev_t *mddev, char *page)
3536 {
3537         unsigned long max_sectors, resync;
3538
3539         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3540                 return sprintf(page, "none\n");
3541
3542         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3543                 max_sectors = mddev->resync_max_sectors;
3544         else
3545                 max_sectors = mddev->dev_sectors;
3546
3547         resync = mddev->curr_resync_completed;
3548         return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3549 }
3550
3551 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3552
3553 static ssize_t
3554 min_sync_show(mddev_t *mddev, char *page)
3555 {
3556         return sprintf(page, "%llu\n",
3557                        (unsigned long long)mddev->resync_min);
3558 }
3559 static ssize_t
3560 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3561 {
3562         unsigned long long min;
3563         if (strict_strtoull(buf, 10, &min))
3564                 return -EINVAL;
3565         if (min > mddev->resync_max)
3566                 return -EINVAL;
3567         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3568                 return -EBUSY;
3569
3570         /* Must be a multiple of chunk_size */
3571         if (mddev->chunk_sectors) {
3572                 sector_t temp = min;
3573                 if (sector_div(temp, mddev->chunk_sectors))
3574                         return -EINVAL;
3575         }
3576         mddev->resync_min = min;
3577
3578         return len;
3579 }
3580
3581 static struct md_sysfs_entry md_min_sync =
3582 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3583
3584 static ssize_t
3585 max_sync_show(mddev_t *mddev, char *page)
3586 {
3587         if (mddev->resync_max == MaxSector)
3588                 return sprintf(page, "max\n");
3589         else
3590                 return sprintf(page, "%llu\n",
3591                                (unsigned long long)mddev->resync_max);
3592 }
3593 static ssize_t
3594 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3595 {
3596         if (strncmp(buf, "max", 3) == 0)
3597                 mddev->resync_max = MaxSector;
3598         else {
3599                 unsigned long long max;
3600                 if (strict_strtoull(buf, 10, &max))
3601                         return -EINVAL;
3602                 if (max < mddev->resync_min)
3603                         return -EINVAL;
3604                 if (max < mddev->resync_max &&
3605                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3606                         return -EBUSY;
3607
3608                 /* Must be a multiple of chunk_size */
3609                 if (mddev->chunk_sectors) {
3610                         sector_t temp = max;
3611                         if (sector_div(temp, mddev->chunk_sectors))
3612                                 return -EINVAL;
3613                 }
3614                 mddev->resync_max = max;
3615         }
3616         wake_up(&mddev->recovery_wait);
3617         return len;
3618 }
3619
3620 static struct md_sysfs_entry md_max_sync =
3621 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3622
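/* Sketch: sync_min/sync_max bound (in sectors) the range a requested
 * resync/check will cover; values must be chunk-aligned, and sync_max
 * additionally accepts "max" to remove the bound:
 *
 *   # echo 0       > /sys/block/md0/md/sync_min
 *   # echo 2097152 > /sys/block/md0/md/sync_max
 */
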
3623 static ssize_t
3624 suspend_lo_show(mddev_t *mddev, char *page)
3625 {
3626         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3627 }
3628
3629 static ssize_t
3630 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3631 {
3632         char *e;
3633         unsigned long long new = simple_strtoull(buf, &e, 10);
3634
3635         if (mddev->pers == NULL || 
3636             mddev->pers->quiesce == NULL)
3637                 return -EINVAL;
3638         if (buf == e || (*e && *e != '\n'))
3639                 return -EINVAL;
3640         if (new >= mddev->suspend_hi ||
3641             (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3642                 mddev->suspend_lo = new;
3643                 mddev->pers->quiesce(mddev, 2);
3644                 return len;
3645         } else
3646                 return -EINVAL;
3647 }
3648 static struct md_sysfs_entry md_suspend_lo =
3649 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3650
3651
3652 static ssize_t
3653 suspend_hi_show(mddev_t *mddev, char *page)
3654 {
3655         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3656 }
3657
3658 static ssize_t
3659 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3660 {
3661         char *e;
3662         unsigned long long new = simple_strtoull(buf, &e, 10);
3663
3664         if (mddev->pers == NULL ||
3665             mddev->pers->quiesce == NULL)
3666                 return -EINVAL;
3667         if (buf == e || (*e && *e != '\n'))
3668                 return -EINVAL;
3669         if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3670             (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3671                 mddev->suspend_hi = new;
3672                 mddev->pers->quiesce(mddev, 1);
3673                 mddev->pers->quiesce(mddev, 0);
3674                 return len;
3675         } else
3676                 return -EINVAL;
3677 }
3678 static struct md_sysfs_entry md_suspend_hi =
3679 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3680
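/* Sketch: suspend_lo/suspend_hi delimit a sector window in which the
 * array should not process IO (only personalities with ->quiesce()
 * honour this); userspace-driven reshape uses it to back up the region
 * currently being rearranged.
 */
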
3681 static ssize_t
3682 reshape_position_show(mddev_t *mddev, char *page)
3683 {
3684         if (mddev->reshape_position != MaxSector)
3685                 return sprintf(page, "%llu\n",
3686                                (unsigned long long)mddev->reshape_position);
3687         strcpy(page, "none\n");
3688         return 5;
3689 }
3690
3691 static ssize_t
3692 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3693 {
3694         char *e;
3695         unsigned long long new = simple_strtoull(buf, &e, 10);
3696         if (mddev->pers)
3697                 return -EBUSY;
3698         if (buf == e || (*e && *e != '\n'))
3699                 return -EINVAL;
3700         mddev->reshape_position = new;
3701         mddev->delta_disks = 0;
3702         mddev->new_level = mddev->level;
3703         mddev->new_layout = mddev->layout;
3704         mddev->new_chunk_sectors = mddev->chunk_sectors;
3705         return len;
3706 }
3707
3708 static struct md_sysfs_entry md_reshape_position =
3709 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3710        reshape_position_store);
3711
3712 static ssize_t
3713 array_size_show(mddev_t *mddev, char *page)
3714 {
3715         if (mddev->external_size)
3716                 return sprintf(page, "%llu\n",
3717                                (unsigned long long)mddev->array_sectors/2);
3718         else
3719                 return sprintf(page, "default\n");
3720 }
3721
3722 static ssize_t
3723 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3724 {
3725         sector_t sectors;
3726
3727         if (strncmp(buf, "default", 7) == 0) {
3728                 if (mddev->pers)
3729                         sectors = mddev->pers->size(mddev, 0, 0);
3730                 else
3731                         sectors = mddev->array_sectors;
3732
3733                 mddev->external_size = 0;
3734         } else {
3735                 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3736                         return -EINVAL;
3737                 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3738                         return -E2BIG;
3739
3740                 mddev->external_size = 1;
3741         }
3742
3743         mddev->array_sectors = sectors;
3744         set_capacity(mddev->gendisk, mddev->array_sectors);
3745         if (mddev->pers)
3746                 revalidate_disk(mddev->gendisk);
3747
3748         return len;
3749 }
3750
3751 static struct md_sysfs_entry md_array_size =
3752 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3753        array_size_store);
3754
3755 static struct attribute *md_default_attrs[] = {
3756         &md_level.attr,
3757         &md_layout.attr,
3758         &md_raid_disks.attr,
3759         &md_chunk_size.attr,
3760         &md_size.attr,
3761         &md_resync_start.attr,
3762         &md_metadata.attr,
3763         &md_new_device.attr,
3764         &md_safe_delay.attr,
3765         &md_array_state.attr,
3766         &md_reshape_position.attr,
3767         &md_array_size.attr,
3768         NULL,
3769 };
3770
3771 static struct attribute *md_redundancy_attrs[] = {
3772         &md_scan_mode.attr,
3773         &md_mismatches.attr,
3774         &md_sync_min.attr,
3775         &md_sync_max.attr,
3776         &md_sync_speed.attr,
3777         &md_sync_force_parallel.attr,
3778         &md_sync_completed.attr,
3779         &md_min_sync.attr,
3780         &md_max_sync.attr,
3781         &md_suspend_lo.attr,
3782         &md_suspend_hi.attr,
3783         &md_bitmap.attr,
3784         &md_degraded.attr,
3785         NULL,
3786 };
3787 static struct attribute_group md_redundancy_group = {
3788         .name = NULL,
3789         .attrs = md_redundancy_attrs,
3790 };
3791
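/* This group is added by do_md_run() for personalities that provide
 * ->sync_request, and removed again via mddev_delayed_delete() - see
 * the mddev->private marker used there.
 */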
3792
3793 static ssize_t
3794 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3795 {
3796         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3797         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3798         ssize_t rv;
3799
3800         if (!entry->show)
3801                 return -EIO;
3802         rv = mddev_lock(mddev);
3803         if (!rv) {
3804                 rv = entry->show(mddev, page);
3805                 mddev_unlock(mddev);
3806         }
3807         return rv;
3808 }
3809
3810 static ssize_t
3811 md_attr_store(struct kobject *kobj, struct attribute *attr,
3812               const char *page, size_t length)
3813 {
3814         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3815         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3816         ssize_t rv;
3817
3818         if (!entry->store)
3819                 return -EIO;
3820         if (!capable(CAP_SYS_ADMIN))
3821                 return -EACCES;
3822         rv = mddev_lock(mddev);
3823         if (mddev->hold_active == UNTIL_IOCTL)
3824                 mddev->hold_active = 0;
3825         if (!rv) {
3826                 rv = entry->store(mddev, page, length);
3827                 mddev_unlock(mddev);
3828         }
3829         return rv;
3830 }
3831
3832 static void md_free(struct kobject *ko)
3833 {
3834         mddev_t *mddev = container_of(ko, mddev_t, kobj);
3835
3836         if (mddev->sysfs_state)
3837                 sysfs_put(mddev->sysfs_state);
3838
3839         if (mddev->gendisk) {
3840                 del_gendisk(mddev->gendisk);
3841                 put_disk(mddev->gendisk);
3842         }
3843         if (mddev->queue)
3844                 blk_cleanup_queue(mddev->queue);
3845
3846         kfree(mddev);
3847 }
3848
3849 static struct sysfs_ops md_sysfs_ops = {
3850         .show   = md_attr_show,
3851         .store  = md_attr_store,
3852 };
3853 static struct kobj_type md_ktype = {
3854         .release        = md_free,
3855         .sysfs_ops      = &md_sysfs_ops,
3856         .default_attrs  = md_default_attrs,
3857 };
3858
3859 int mdp_major = 0;
3860
3861 static void mddev_delayed_delete(struct work_struct *ws)
3862 {
3863         mddev_t *mddev = container_of(ws, mddev_t, del_work);
3864
3865         if (mddev->private == &md_redundancy_group) {
3866                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3867                 if (mddev->sysfs_action)
3868                         sysfs_put(mddev->sysfs_action);
3869                 mddev->sysfs_action = NULL;
3870                 mddev->private = NULL;
3871         }
3872         kobject_del(&mddev->kobj);
3873         kobject_put(&mddev->kobj);
3874 }
3875
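/* Create the gendisk, request queue and "md" sysfs kobject for one
 * array.  'name' is non-NULL only for arrays created by name (see
 * add_named_array() below) and must then be unique.
 */
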
3876 static int md_alloc(dev_t dev, char *name)
3877 {
3878         static DEFINE_MUTEX(disks_mutex);
3879         mddev_t *mddev = mddev_find(dev);
3880         struct gendisk *disk;
3881         int partitioned;
3882         int shift;
3883         int unit;
3884         int error;
3885
3886         if (!mddev)
3887                 return -ENODEV;
3888
3889         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3890         shift = partitioned ? MdpMinorShift : 0;
3891         unit = MINOR(mddev->unit) >> shift;
3892
3893         /* wait for any previous instance of this device
3894          * to be completely removed (mddev_delayed_delete).
3895          */
3896         flush_scheduled_work();
3897
3898         mutex_lock(&disks_mutex);
3899         error = -EEXIST;
3900         if (mddev->gendisk)
3901                 goto abort;
3902
3903         if (name) {
3904                 /* Need to ensure that 'name' is not a duplicate.
3905                  */
3906                 mddev_t *mddev2;
3907                 spin_lock(&all_mddevs_lock);
3908
3909                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3910                         if (mddev2->gendisk &&
3911                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
3912                                 spin_unlock(&all_mddevs_lock);
3913                                 goto abort;
3914                         }
3915                 spin_unlock(&all_mddevs_lock);
3916         }
3917
3918         error = -ENOMEM;
3919         mddev->queue = blk_alloc_queue(GFP_KERNEL);
3920         if (!mddev->queue)
3921                 goto abort;
3922         mddev->queue->queuedata = mddev;
3923
3924         /* Can be unlocked because the queue is new: no concurrency */
3925         queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3926
3927         blk_queue_make_request(mddev->queue, md_make_request);
3928
3929         disk = alloc_disk(1 << shift);
3930         if (!disk) {
3931                 blk_cleanup_queue(mddev->queue);
3932                 mddev->queue = NULL;
3933                 goto abort;
3934         }
3935         disk->major = MAJOR(mddev->unit);
3936         disk->first_minor = unit << shift;
3937         if (name)
3938                 strcpy(disk->disk_name, name);
3939         else if (partitioned)
3940                 sprintf(disk->disk_name, "md_d%d", unit);
3941         else
3942                 sprintf(disk->disk_name, "md%d", unit);
3943         disk->fops = &md_fops;
3944         disk->private_data = mddev;
3945         disk->queue = mddev->queue;
3946         /* Allow extended partitions.  This makes the
3947          * 'mdp' device redundant, but we can't really
3948          * remove it now.
3949          */
3950         disk->flags |= GENHD_FL_EXT_DEVT;
3951         add_disk(disk);
3952         mddev->gendisk = disk;
3953         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3954                                      &disk_to_dev(disk)->kobj, "%s", "md");
3955         if (error) {
3956                 /* This isn't possible, but as kobject_init_and_add is marked
3957                  * __must_check, we must do something with the result
3958                  */
3959                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3960                        disk->disk_name);
3961                 error = 0;
3962         }
3963  abort:
3964         mutex_unlock(&disks_mutex);
3965         if (!error) {
3966                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3967                 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3968         }
3969         mddev_put(mddev);
3970         return error;
3971 }
3972
3973 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3974 {
3975         md_alloc(dev, NULL);
3976         return NULL;
3977 }
3978
3979 static int add_named_array(const char *val, struct kernel_param *kp)
3980 {
3981         /* val must be "md_*" where * is not all digits.
3982          * We allocate an array with a large free minor number, and
3983          * set the name to val.  val must not already be an active name.
3984          */
3985         int len = strlen(val);
3986         char buf[DISK_NAME_LEN];
3987
3988         while (len && val[len-1] == '\n')
3989                 len--;
3990         if (len >= DISK_NAME_LEN)
3991                 return -E2BIG;
3992         strlcpy(buf, val, len+1);
3993         if (strncmp(buf, "md_", 3) != 0)
3994                 return -EINVAL;
3995         return md_alloc(0, buf);
3996 }
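
/*
 * Usage sketch, assuming this handler is wired up as the md_mod
 * "new_array" module parameter (the array name is illustrative):
 *
 *   # echo md_home > /sys/module/md_mod/parameters/new_array
 *
 * which creates /dev/md_home with a dynamically allocated minor.
 */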
3997
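/* Safemode timer callback: if no writes arrived during the delay, flag
 * the array as a candidate to be marked 'clean'; the md thread woken
 * below does the actual superblock update.
 */
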
3998 static void md_safemode_timeout(unsigned long data)
3999 {
4000         mddev_t *mddev = (mddev_t *) data;
4001
4002         if (!atomic_read(&mddev->writes_pending)) {
4003                 mddev->safemode = 1;
4004                 if (mddev->external)
4005                         sysfs_notify_dirent(mddev->sysfs_state);
4006         }
4007         md_wakeup_thread(mddev->thread);
4008 }
4009
4010 static int start_dirty_degraded;
4011
4012 static int do_md_run(mddev_t * mddev)
4013 {
4014         int err;
4015         mdk_rdev_t *rdev;
4016         struct gendisk *disk;
4017         struct mdk_personality *pers;
4018
4019         if (list_empty(&mddev->disks))
4020                 /* cannot run an array with no devices. */
4021                 return -EINVAL;
4022
4023         if (mddev->pers)
4024                 return -EBUSY;
4025
4026         /*
4027          * Analyze all RAID superblock(s)
4028          */
4029         if (!mddev->raid_disks) {
4030                 if (!mddev->persistent)
4031                         return -EINVAL;
4032                 analyze_sbs(mddev);
4033         }
4034
4035         if (mddev->level != LEVEL_NONE)
4036                 request_module("md-level-%d", mddev->level);
4037         else if (mddev->clevel[0])
4038                 request_module("md-%s", mddev->clevel);
4039
4040         /*
4041          * Drop all container device buffers; from now on
4042          * the only valid external interface is through the md
4043          * device.
4044          */
4045         list_for_each_entry(rdev, &mddev->disks, same_set) {
4046                 if (test_bit(Faulty, &rdev->flags))
4047                         continue;
4048                 sync_blockdev(rdev->bdev);
4049                 invalidate_bdev(rdev->bdev);
4050
4051                 /* perform some consistency tests on the device.
4052                  * We don't want the data to overlap the metadata.
4053                  * Internal bitmap issues have been handled elsewhere.
4054                  */
4055                 if (rdev->data_offset < rdev->sb_start) {
4056                         if (mddev->dev_sectors &&
4057                             rdev->data_offset + mddev->dev_sectors
4058                             > rdev->sb_start) {
4059                                 printk("md: %s: data overlaps metadata\n",
4060                                        mdname(mddev));
4061                                 return -EINVAL;
4062                         }
4063                 } else {
4064                         if (rdev->sb_start + rdev->sb_size/512
4065                             > rdev->data_offset) {
4066                                 printk("md: %s: metadata overlaps data\n",
4067                                        mdname(mddev));
4068                                 return -EINVAL;
4069                         }
4070                 }
4071                 sysfs_notify_dirent(rdev->sysfs_state);
4072         }
4073
4074         md_probe(mddev->unit, NULL, NULL);
4075         disk = mddev->gendisk;
4076         if (!disk)
4077                 return -ENOMEM;
4078
4079         spin_lock(&pers_lock);
4080         pers = find_pers(mddev->level, mddev->clevel);
4081         if (!pers || !try_module_get(pers->owner)) {
4082                 spin_unlock(&pers_lock);
4083                 if (mddev->level != LEVEL_NONE)
4084                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4085                                mddev->level);
4086                 else
4087                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4088                                mddev->clevel);
4089                 return -EINVAL;
4090         }
4091         mddev->pers = pers;
4092         spin_unlock(&pers_lock);
4093         if (mddev->level != pers->level) {
4094                 mddev->level = pers->level;
4095                 mddev->new_level = pers->level;
4096         }
4097         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4098
4099         if (mddev->reshape_position != MaxSector &&
4100             pers->start_reshape == NULL) {
4101                 /* This personality cannot handle reshaping... */
4102                 mddev->pers = NULL;
4103                 module_put(pers->owner);
4104                 return -EINVAL;
4105         }
4106
4107         if (pers->sync_request) {
4108                 /* Warn if this is a potentially silly
4109                  * configuration.
4110                  */
4111                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4112                 mdk_rdev_t *rdev2;
4113                 int warned = 0;
4114
4115                 list_for_each_entry(rdev, &mddev->disks, same_set)
4116                         list_for_each_entry(rdev2, &mddev->disks, same_set) {
4117                                 if (rdev < rdev2 &&
4118                                     rdev->bdev->bd_contains ==
4119                                     rdev2->bdev->bd_contains) {
4120                                         printk(KERN_WARNING
4121                                                "%s: WARNING: %s appears to be"
4122                                                " on the same physical disk as"
4123                                                " %s.\n",
4124                                                mdname(mddev),
4125                                                bdevname(rdev->bdev,b),
4126                                                bdevname(rdev2->bdev,b2));
4127                                         warned = 1;
4128                                 }
4129                         }
4130
4131                 if (warned)
4132                         printk(KERN_WARNING
4133                                "True protection against single-disk"
4134                                " failure might be compromised.\n");
4135         }
4136
4137         mddev->recovery = 0;
4138         /* may be overridden by the personality */
4139         mddev->resync_max_sectors = mddev->dev_sectors;
4140
4141         mddev->barriers_work = 1;
4142         mddev->ok_start_degraded = start_dirty_degraded;
4143
4144         if (start_readonly)
4145                 mddev->ro = 2; /* read-only, but switch on first write */
4146
4147         err = mddev->pers->run(mddev);
4148         if (err)
4149                 printk(KERN_ERR "md: pers->run() failed ...\n");
4150         else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4151                 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4152                           " but 'external_size' not in effect?\n", __func__);
4153                 printk(KERN_ERR
4154                        "md: invalid array_size %llu > default size %llu\n",
4155                        (unsigned long long)mddev->array_sectors / 2,
4156                        (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4157                 err = -EINVAL;
4158                 mddev->pers->stop(mddev);
4159         }
4160         if (err == 0 && mddev->pers->sync_request) {
4161                 err = bitmap_create(mddev);
4162                 if (err) {
4163                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4164                                mdname(mddev), err);
4165                         mddev->pers->stop(mddev);
4166                 }
4167         }
4168         if (err) {
4169                 module_put(mddev->pers->owner);
4170                 mddev->pers = NULL;
4171                 bitmap_destroy(mddev);
4172                 return err;
4173         }
4174         if (mddev->pers->sync_request) {
4175                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4176                         printk(KERN_WARNING
4177                                "md: cannot register extra attributes for %s\n",
4178                                mdname(mddev));
4179                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4180         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4181                 mddev->ro = 0;
4182
4183         atomic_set(&mddev->writes_pending,0);
4184         mddev->safemode = 0;
4185         mddev->safemode_timer.function = md_safemode_timeout;
4186         mddev->safemode_timer.data = (unsigned long) mddev;
4187         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4188         mddev->in_sync = 1;
4189
4190         list_for_each_entry(rdev, &mddev->disks, same_set)
4191                 if (rdev->raid_disk >= 0) {
4192                         char nm[20];
4193                         sprintf(nm, "rd%d", rdev->raid_disk);
4194                         if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4195                                 printk("md: cannot register %s for %s\n",
4196                                        nm, mdname(mddev));
4197                 }
4198         
4199         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4200         
4201         if (mddev->flags)
4202                 md_update_sb(mddev, 0);
4203
4204         set_capacity(disk, mddev->array_sectors);
4205
4206         /* If there is a partially-recovered drive we need to
4207          * start recovery here.  If we leave it to md_check_recovery,
4208          * it will remove the drives and not do the right thing
4209          */
4210         if (mddev->degraded && !mddev->sync_thread) {
4211                 int spares = 0;
4212                 list_for_each_entry(rdev, &mddev->disks, same_set)
4213                         if (rdev->raid_disk >= 0 &&
4214                             !test_bit(In_sync, &rdev->flags) &&
4215                             !test_bit(Faulty, &rdev->flags))
4216                                 /* complete an interrupted recovery */
4217                                 spares++;
4218                 if (spares && mddev->pers->sync_request) {
4219                         mddev->recovery = 0;
4220                         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4221                         mddev->sync_thread = md_register_thread(md_do_sync,
4222                                                                 mddev,
4223                                                                 "%s_resync");
4224                         if (!mddev->sync_thread) {
4225                                 printk(KERN_ERR "%s: could not start resync"
4226                                        " thread...\n",
4227                                        mdname(mddev));
4228                                 /* leave the spares where they are, it shouldn't hurt */
4229                                 mddev->recovery = 0;
4230                         }
4231                 }
4232         }
4233         md_wakeup_thread(mddev->thread);
4234         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4235
4236         revalidate_disk(mddev->gendisk);
4237         mddev->changed = 1;
4238         md_new_event(mddev);
4239         sysfs_notify_dirent(mddev->sysfs_state);
4240         if (mddev->sysfs_action)
4241                 sysfs_notify_dirent(mddev->sysfs_action);
4242         sysfs_notify(&mddev->kobj, NULL, "degraded");
4243         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4244         return 0;
4245 }
4246
4247 static int restart_array(mddev_t *mddev)
4248 {
4249         struct gendisk *disk = mddev->gendisk;
4250
4251         /* Complain if it has no devices */
4252         if (list_empty(&mddev->disks))
4253                 return -ENXIO;
4254         if (!mddev->pers)
4255                 return -EINVAL;
4256         if (!mddev->ro)
4257                 return -EBUSY;
4258         mddev->safemode = 0;
4259         mddev->ro = 0;
4260         set_disk_ro(disk, 0);
4261         printk(KERN_INFO "md: %s switched to read-write mode.\n",
4262                 mdname(mddev));
4263         /* Kick recovery or resync if necessary */
4264         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4265         md_wakeup_thread(mddev->thread);
4266         md_wakeup_thread(mddev->sync_thread);
4267         sysfs_notify_dirent(mddev->sysfs_state);
4268         return 0;
4269 }
4270
4271 /* similar to deny_write_access, but accounts for our holding a reference
4272  * to the file ourselves */
4273 static int deny_bitmap_write_access(struct file * file)
4274 {
4275         struct inode *inode = file->f_mapping->host;
4276
4277         spin_lock(&inode->i_lock);
4278         if (atomic_read(&inode->i_writecount) > 1) {
4279                 spin_unlock(&inode->i_lock);
4280                 return -ETXTBSY;
4281         }
4282         atomic_set(&inode->i_writecount, -1);
4283         spin_unlock(&inode->i_lock);
4284
4285         return 0;
4286 }
4287
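/* Undo deny_bitmap_write_access() once md drops its claim on the file. */
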
4288 static void restore_bitmap_write_access(struct file *file)
4289 {
4290         struct inode *inode = file->f_mapping->host;
4291
4292         spin_lock(&inode->i_lock);
4293         atomic_set(&inode->i_writecount, 1);
4294         spin_unlock(&inode->i_lock);
4295 }
4296
4297 /* mode:
4298  *   0 - completely stop and disassemble the array
4299  *   1 - switch to readonly
4300  *   2 - stop but do not disassemble array
4301  */
4302 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4303 {
4304         int err = 0;
4305         struct gendisk *disk = mddev->gendisk;
4306         mdk_rdev_t *rdev;
4307
4308         mutex_lock(&mddev->open_mutex);
4309         if (atomic_read(&mddev->openers) > is_open) {
4310                 printk("md: %s still in use.\n",mdname(mddev));
4311                 err = -EBUSY;
4312         } else if (mddev->pers) {
4313
4314                 if (mddev->sync_thread) {
4315                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4316                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4317                         md_unregister_thread(mddev->sync_thread);
4318                         mddev->sync_thread = NULL;
4319                 }
4320
4321                 del_timer_sync(&mddev->safemode_timer);
4322
4323                 switch(mode) {
4324                 case 1: /* readonly */
4325                         err  = -ENXIO;
4326                         if (mddev->ro==1)
4327                                 goto out;
4328                         mddev->ro = 1;
4329                         break;
4330                 case 0: /* disassemble */
4331                 case 2: /* stop */
4332                         bitmap_flush(mddev);
4333                         md_super_wait(mddev);
4334                         if (mddev->ro)
4335                                 set_disk_ro(disk, 0);
4336
4337                         mddev->pers->stop(mddev);
4338                         mddev->queue->merge_bvec_fn = NULL;
4339                         mddev->queue->unplug_fn = NULL;
4340                         mddev->queue->backing_dev_info.congested_fn = NULL;
4341                         module_put(mddev->pers->owner);
4342                         if (mddev->pers->sync_request)
4343                                 mddev->private = &md_redundancy_group;
4344                         mddev->pers = NULL;
4345                         /* tell userspace to handle 'inactive' */
4346                         sysfs_notify_dirent(mddev->sysfs_state);
4347
4348                         list_for_each_entry(rdev, &mddev->disks, same_set)
4349                                 if (rdev->raid_disk >= 0) {
4350                                         char nm[20];
4351                                         sprintf(nm, "rd%d", rdev->raid_disk);
4352                                         sysfs_remove_link(&mddev->kobj, nm);
4353                                 }
4354
4355                         set_capacity(disk, 0);
4356                         mddev->changed = 1;
4357
4358                         if (mddev->ro)
4359                                 mddev->ro = 0;
4360                 }
4361                 if (!mddev->in_sync || mddev->flags) {
4362                         /* mark array as shutdown cleanly */
4363                         mddev->in_sync = 1;
4364                         md_update_sb(mddev, 1);
4365                 }
4366                 if (mode == 1)
4367                         set_disk_ro(disk, 1);
4368                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4369         }
4370 out:
4371         mutex_unlock(&mddev->open_mutex);
4372         if (err)
4373                 return err;
4374         /*
4375          * Free resources if final stop
4376          */
4377         if (mode == 0) {
4378
4379                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4380
4381                 bitmap_destroy(mddev);
4382                 if (mddev->bitmap_file) {
4383                         restore_bitmap_write_access(mddev->bitmap_file);
4384                         fput(mddev->bitmap_file);
4385                         mddev->bitmap_file = NULL;
4386                 }
4387                 mddev->bitmap_offset = 0;
4388
4389                 /* make sure all md_delayed_delete calls have finished */
4390                 flush_scheduled_work();
4391
4392                 export_array(mddev);
4393
4394                 mddev->array_sectors = 0;
4395                 mddev->external_size = 0;
4396                 mddev->dev_sectors = 0;
4397                 mddev->raid_disks = 0;
4398                 mddev->recovery_cp = 0;
4399                 mddev->resync_min = 0;
4400                 mddev->resync_max = MaxSector;
4401                 mddev->reshape_position = MaxSector;
4402                 mddev->external = 0;
4403                 mddev->persistent = 0;
4404                 mddev->level = LEVEL_NONE;
4405                 mddev->clevel[0] = 0;
4406                 mddev->flags = 0;
4407                 mddev->ro = 0;
4408                 mddev->metadata_type[0] = 0;
4409                 mddev->chunk_sectors = 0;
4410                 mddev->ctime = mddev->utime = 0;
4411                 mddev->layout = 0;
4412                 mddev->max_disks = 0;
4413                 mddev->events = 0;
4414                 mddev->delta_disks = 0;
4415                 mddev->new_level = LEVEL_NONE;
4416                 mddev->new_layout = 0;
4417                 mddev->new_chunk_sectors = 0;
4418                 mddev->curr_resync = 0;
4419                 mddev->resync_mismatches = 0;
4420                 mddev->suspend_lo = mddev->suspend_hi = 0;
4421                 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4422                 mddev->recovery = 0;
4423                 mddev->in_sync = 0;
4424                 mddev->changed = 0;
4425                 mddev->degraded = 0;
4426                 mddev->barriers_work = 0;
4427                 mddev->safemode = 0;
4428                 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4429                 if (mddev->hold_active == UNTIL_STOP)
4430                         mddev->hold_active = 0;
4431
4432         } else if (mddev->pers)
4433                 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4434                         mdname(mddev));
4435         err = 0;
4436         blk_integrity_unregister(disk);
4437         md_new_event(mddev);
4438         sysfs_notify_dirent(mddev->sysfs_state);
4439         return err;
4440 }
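
/*
 * The mode values above map onto callers elsewhere in this file:
 * md_ioctl() runs STOP_ARRAY as do_md_stop(mddev, 0, 1) and
 * STOP_ARRAY_RO as do_md_stop(mddev, 1, 1), where the caller's own open
 * of the device accounts for is_open == 1, and autorun_array() uses
 * do_md_stop(mddev, 0, 0) when do_md_run() fails.  Mode 2 (stop without
 * disassembling) has no ioctl of its own and is left to in-kernel
 * callers.
 */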
4441
4442 #ifndef MODULE
4443 static void autorun_array(mddev_t *mddev)
4444 {
4445         mdk_rdev_t *rdev;
4446         int err;
4447
4448         if (list_empty(&mddev->disks))
4449                 return;
4450
4451         printk(KERN_INFO "md: running: ");
4452
4453         list_for_each_entry(rdev, &mddev->disks, same_set) {
4454                 char b[BDEVNAME_SIZE];
4455                 printk("<%s>", bdevname(rdev->bdev,b));
4456         }
4457         printk("\n");
4458
4459         err = do_md_run(mddev);
4460         if (err) {
4461                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4462                 do_md_stop(mddev, 0, 0);
4463         }
4464 }
4465
4466 /*
4467  * let's try to run arrays based on all disks that have arrived
4468  * until now. (those are in pending_raid_disks)
4469  *
4470  * the method: pick the first pending disk, collect all disks with
4471  * the same UUID, remove all from the pending list and put them into
4472  * the 'same_array' list. Then order this list based on superblock
4473  * update time (freshest comes first), kick out 'old' disks and
4474  * compare superblocks. If everything's fine then run it.
4475  *
4476  * If "unit" is allocated, then bump its reference count
4477  */
4478 static void autorun_devices(int part)
4479 {
4480         mdk_rdev_t *rdev0, *rdev, *tmp;
4481         mddev_t *mddev;
4482         char b[BDEVNAME_SIZE];
4483
4484         printk(KERN_INFO "md: autorun ...\n");
4485         while (!list_empty(&pending_raid_disks)) {
4486                 int unit;
4487                 dev_t dev;
4488                 LIST_HEAD(candidates);
4489                 rdev0 = list_entry(pending_raid_disks.next,
4490                                          mdk_rdev_t, same_set);
4491
4492                 printk(KERN_INFO "md: considering %s ...\n",
4493                         bdevname(rdev0->bdev,b));
4494                 INIT_LIST_HEAD(&candidates);
4495                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4496                         if (super_90_load(rdev, rdev0, 0) >= 0) {
4497                                 printk(KERN_INFO "md:  adding %s ...\n",
4498                                         bdevname(rdev->bdev,b));
4499                                 list_move(&rdev->same_set, &candidates);
4500                         }
4501                 /*
4502                  * now we have a set of devices, with all of them having
4503                  * mostly sane superblocks. It's time to allocate the
4504                  * mddev.
4505                  */
4506                 if (part) {
4507                         dev = MKDEV(mdp_major,
4508                                     rdev0->preferred_minor << MdpMinorShift);
4509                         unit = MINOR(dev) >> MdpMinorShift;
4510                 } else {
4511                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4512                         unit = MINOR(dev);
4513                 }
4514                 if (rdev0->preferred_minor != unit) {
4515                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4516                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4517                         break;
4518                 }
4519
4520                 md_probe(dev, NULL, NULL);
4521                 mddev = mddev_find(dev);
4522                 if (!mddev || !mddev->gendisk) {
4523                         if (mddev)
4524                                 mddev_put(mddev);
4525                         printk(KERN_ERR
4526                                 "md: cannot allocate memory for md drive.\n");
4527                         break;
4528                 }
4529                 if (mddev_lock(mddev)) 
4530                         printk(KERN_WARNING "md: %s locked, cannot run\n",
4531                                mdname(mddev));
4532                 else if (mddev->raid_disks || mddev->major_version
4533                          || !list_empty(&mddev->disks)) {
4534                         printk(KERN_WARNING 
4535                                 "md: %s already running, cannot run %s\n",
4536                                 mdname(mddev), bdevname(rdev0->bdev,b));
4537                         mddev_unlock(mddev);
4538                 } else {
4539                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
4540                         mddev->persistent = 1;
4541                         rdev_for_each_list(rdev, tmp, &candidates) {
4542                                 list_del_init(&rdev->same_set);
4543                                 if (bind_rdev_to_array(rdev, mddev))
4544                                         export_rdev(rdev);
4545                         }
4546                         autorun_array(mddev);
4547                         mddev_unlock(mddev);
4548                 }
4549                 /* on success, candidates will be empty, on error
4550                  * it won't be...
4551                  */
4552                 rdev_for_each_list(rdev, tmp, &candidates) {
4553                         list_del_init(&rdev->same_set);
4554                         export_rdev(rdev);
4555                 }
4556                 mddev_put(mddev);
4557         }
4558         printk(KERN_INFO "md: ... autorun DONE.\n");
4559 }
4560 #endif /* !MODULE */
4561
4562 static int get_version(void __user * arg)
4563 {
4564         mdu_version_t ver;
4565
4566         ver.major = MD_MAJOR_VERSION;
4567         ver.minor = MD_MINOR_VERSION;
4568         ver.patchlevel = MD_PATCHLEVEL_VERSION;
4569
4570         if (copy_to_user(arg, &ver, sizeof(ver)))
4571                 return -EFAULT;
4572
4573         return 0;
4574 }
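
/*
 * Hypothetical userspace sketch of the RAID_VERSION ioctl served by
 * get_version() above (device path is an assumption):
 *
 *	mdu_version_t ver;
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, RAID_VERSION, &ver) == 0)
 *		printf("md %d.%d.%d\n", ver.major, ver.minor, ver.patchlevel);
 */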
4575
4576 static int get_array_info(mddev_t * mddev, void __user * arg)
4577 {
4578         mdu_array_info_t info;
4579         int nr,working,active,failed,spare;
4580         mdk_rdev_t *rdev;
4581
4582         nr=working=active=failed=spare=0;
4583         list_for_each_entry(rdev, &mddev->disks, same_set) {
4584                 nr++;
4585                 if (test_bit(Faulty, &rdev->flags))
4586                         failed++;
4587                 else {
4588                         working++;
4589                         if (test_bit(In_sync, &rdev->flags))
4590                                 active++;       
4591                         else
4592                                 spare++;
4593                 }
4594         }
4595
4596         info.major_version = mddev->major_version;
4597         info.minor_version = mddev->minor_version;
4598         info.patch_version = MD_PATCHLEVEL_VERSION;
4599         info.ctime         = mddev->ctime;
4600         info.level         = mddev->level;
4601         info.size          = mddev->dev_sectors / 2;
4602         if (info.size != mddev->dev_sectors / 2) /* overflow */
4603                 info.size = -1;
4604         info.nr_disks      = nr;
4605         info.raid_disks    = mddev->raid_disks;
4606         info.md_minor      = mddev->md_minor;
4607         info.not_persistent= !mddev->persistent;
4608
4609         info.utime         = mddev->utime;
4610         info.state         = 0;
4611         if (mddev->in_sync)
4612                 info.state = (1<<MD_SB_CLEAN);
4613         if (mddev->bitmap && mddev->bitmap_offset)
4614                 info.state = (1<<MD_SB_BITMAP_PRESENT);
4615         info.active_disks  = active;
4616         info.working_disks = working;
4617         info.failed_disks  = failed;
4618         info.spare_disks   = spare;
4619
4620         info.layout        = mddev->layout;
4621         info.chunk_size    = mddev->chunk_sectors << 9;
4622
4623         if (copy_to_user(arg, &info, sizeof(info)))
4624                 return -EFAULT;
4625
4626         return 0;
4627 }
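
/*
 * Note on the info.size overflow check above: mdu_array_info_t.size is a
 * 32-bit int counting KiB, so a component size of 2 TiB or more makes
 * dev_sectors / 2 exceed INT_MAX.  The truncation is detected by the
 * read-back comparison and reported as -1; userspace is then expected to
 * obtain the real size elsewhere (e.g. from sysfs).
 */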
4628
4629 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4630 {
4631         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4632         char *ptr, *buf = NULL;
4633         int err = -ENOMEM;
4634
4635         if (md_allow_write(mddev))
4636                 file = kmalloc(sizeof(*file), GFP_NOIO);
4637         else
4638                 file = kmalloc(sizeof(*file), GFP_KERNEL);
4639
4640         if (!file)
4641                 goto out;
4642
4643         /* bitmap disabled, zero the first byte and copy out */
4644         if (!mddev->bitmap || !mddev->bitmap->file) {
4645                 file->pathname[0] = '\0';
4646                 goto copy_out;
4647         }
4648
4649         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4650         if (!buf)
4651                 goto out;
4652
4653         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4654         if (IS_ERR(ptr))
4655                 goto out;
4656
4657         strcpy(file->pathname, ptr);
4658
4659 copy_out:
4660         err = 0;
4661         if (copy_to_user(arg, file, sizeof(*file)))
4662                 err = -EFAULT;
4663 out:
4664         kfree(buf);
4665         kfree(file);
4666         return err;
4667 }
4668
4669 static int get_disk_info(mddev_t * mddev, void __user * arg)
4670 {
4671         mdu_disk_info_t info;
4672         mdk_rdev_t *rdev;
4673
4674         if (copy_from_user(&info, arg, sizeof(info)))
4675                 return -EFAULT;
4676
4677         rdev = find_rdev_nr(mddev, info.number);
4678         if (rdev) {
4679                 info.major = MAJOR(rdev->bdev->bd_dev);
4680                 info.minor = MINOR(rdev->bdev->bd_dev);
4681                 info.raid_disk = rdev->raid_disk;
4682                 info.state = 0;
4683                 if (test_bit(Faulty, &rdev->flags))
4684                         info.state |= (1<<MD_DISK_FAULTY);
4685                 else if (test_bit(In_sync, &rdev->flags)) {
4686                         info.state |= (1<<MD_DISK_ACTIVE);
4687                         info.state |= (1<<MD_DISK_SYNC);
4688                 }
4689                 if (test_bit(WriteMostly, &rdev->flags))
4690                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
4691         } else {
4692                 info.major = info.minor = 0;
4693                 info.raid_disk = -1;
4694                 info.state = (1<<MD_DISK_REMOVED);
4695         }
4696
4697         if (copy_to_user(arg, &info, sizeof(info)))
4698                 return -EFAULT;
4699
4700         return 0;
4701 }
4702
4703 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4704 {
4705         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4706         mdk_rdev_t *rdev;
4707         dev_t dev = MKDEV(info->major,info->minor);
4708
4709         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4710                 return -EOVERFLOW;
4711
4712         if (!mddev->raid_disks) {
4713                 int err;
4714                 /* expecting a device which has a superblock */
4715                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4716                 if (IS_ERR(rdev)) {
4717                         printk(KERN_WARNING 
4718                                 "md: md_import_device returned %ld\n",
4719                                 PTR_ERR(rdev));
4720                         return PTR_ERR(rdev);
4721                 }
4722                 if (!list_empty(&mddev->disks)) {
4723                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4724                                                         mdk_rdev_t, same_set);
4725                         int err = super_types[mddev->major_version]
4726                                 .load_super(rdev, rdev0, mddev->minor_version);
4727                         if (err < 0) {
4728                                 printk(KERN_WARNING 
4729                                         "md: %s has different UUID to %s\n",
4730                                         bdevname(rdev->bdev,b), 
4731                                         bdevname(rdev0->bdev,b2));
4732                                 export_rdev(rdev);
4733                                 return -EINVAL;
4734                         }
4735                 }
4736                 err = bind_rdev_to_array(rdev, mddev);
4737                 if (err)
4738                         export_rdev(rdev);
4739                 return err;
4740         }
4741
4742         /*
4743          * add_new_disk can be used once the array is assembled
4744          * to add "hot spares".  They must already have a superblock
4745          * written
4746          */
4747         if (mddev->pers) {
4748                 int err;
4749                 if (!mddev->pers->hot_add_disk) {
4750                         printk(KERN_WARNING 
4751                                 "%s: personality does not support diskops!\n",
4752                                mdname(mddev));
4753                         return -EINVAL;
4754                 }
4755                 if (mddev->persistent)
4756                         rdev = md_import_device(dev, mddev->major_version,
4757                                                 mddev->minor_version);
4758                 else
4759                         rdev = md_import_device(dev, -1, -1);
4760                 if (IS_ERR(rdev)) {
4761                         printk(KERN_WARNING 
4762                                 "md: md_import_device returned %ld\n",
4763                                 PTR_ERR(rdev));
4764                         return PTR_ERR(rdev);
4765                 }
4766                 /* set save_raid_disk if appropriate */
4767                 if (!mddev->persistent) {
4768                         if (info->state & (1<<MD_DISK_SYNC)  &&
4769                             info->raid_disk < mddev->raid_disks)
4770                                 rdev->raid_disk = info->raid_disk;
4771                         else
4772                                 rdev->raid_disk = -1;
4773                 } else
4774                         super_types[mddev->major_version].
4775                                 validate_super(mddev, rdev);
4776                 rdev->saved_raid_disk = rdev->raid_disk;
4777
4778                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4779                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4780                         set_bit(WriteMostly, &rdev->flags);
4781                 else
4782                         clear_bit(WriteMostly, &rdev->flags);
4783
4784                 rdev->raid_disk = -1;
4785                 err = bind_rdev_to_array(rdev, mddev);
4786                 if (!err && !mddev->pers->hot_remove_disk) {
4787                         /* If there is hot_add_disk but no hot_remove_disk
4788                          * then added disks are for geometry changes
4789                          * and should be added immediately.
4790                          */
4791                         super_types[mddev->major_version].
4792                                 validate_super(mddev, rdev);
4793                         err = mddev->pers->hot_add_disk(mddev, rdev);
4794                         if (err)
4795                                 unbind_rdev_from_array(rdev);
4796                 }
4797                 if (err)
4798                         export_rdev(rdev);
4799                 else
4800                         sysfs_notify_dirent(rdev->sysfs_state);
4801
4802                 md_update_sb(mddev, 1);
4803                 if (mddev->degraded)
4804                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4805                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4806                 md_wakeup_thread(mddev->thread);
4807                 return err;
4808         }
4809
4810         /* otherwise, add_new_disk is only allowed
4811          * for major_version==0 superblocks
4812          */
4813         if (mddev->major_version != 0) {
4814                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4815                        mdname(mddev));
4816                 return -EINVAL;
4817         }
4818
4819         if (!(info->state & (1<<MD_DISK_FAULTY))) {
4820                 int err;
4821                 rdev = md_import_device(dev, -1, 0);
4822                 if (IS_ERR(rdev)) {
4823                         printk(KERN_WARNING 
4824                                 "md: error, md_import_device() returned %ld\n",
4825                                 PTR_ERR(rdev));
4826                         return PTR_ERR(rdev);
4827                 }
4828                 rdev->desc_nr = info->number;
4829                 if (info->raid_disk < mddev->raid_disks)
4830                         rdev->raid_disk = info->raid_disk;
4831                 else
4832                         rdev->raid_disk = -1;
4833
4834                 if (rdev->raid_disk < mddev->raid_disks)
4835                         if (info->state & (1<<MD_DISK_SYNC))
4836                                 set_bit(In_sync, &rdev->flags);
4837
4838                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4839                         set_bit(WriteMostly, &rdev->flags);
4840
4841                 if (!mddev->persistent) {
4842                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
4843                         rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4844                 } else 
4845                         rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4846                 rdev->sectors = rdev->sb_start;
4847
4848                 err = bind_rdev_to_array(rdev, mddev);
4849                 if (err) {
4850                         export_rdev(rdev);
4851                         return err;
4852                 }
4853         }
4854
4855         return 0;
4856 }
4857
4858 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4859 {
4860         char b[BDEVNAME_SIZE];
4861         mdk_rdev_t *rdev;
4862
4863         rdev = find_rdev(mddev, dev);
4864         if (!rdev)
4865                 return -ENXIO;
4866
4867         if (rdev->raid_disk >= 0)
4868                 goto busy;
4869
4870         kick_rdev_from_array(rdev);
4871         md_update_sb(mddev, 1);
4872         md_new_event(mddev);
4873
4874         return 0;
4875 busy:
4876         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4877                 bdevname(rdev->bdev,b), mdname(mddev));
4878         return -EBUSY;
4879 }
4880
4881 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4882 {
4883         char b[BDEVNAME_SIZE];
4884         int err;
4885         mdk_rdev_t *rdev;
4886
4887         if (!mddev->pers)
4888                 return -ENODEV;
4889
4890         if (mddev->major_version != 0) {
4891                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4892                         " version-0 superblocks.\n",
4893                         mdname(mddev));
4894                 return -EINVAL;
4895         }
4896         if (!mddev->pers->hot_add_disk) {
4897                 printk(KERN_WARNING 
4898                         "%s: personality does not support diskops!\n",
4899                         mdname(mddev));
4900                 return -EINVAL;
4901         }
4902
4903         rdev = md_import_device(dev, -1, 0);
4904         if (IS_ERR(rdev)) {
4905                 printk(KERN_WARNING 
4906                         "md: error, md_import_device() returned %ld\n",
4907                         PTR_ERR(rdev));
4908                 return -EINVAL;
4909         }
4910
4911         if (mddev->persistent)
4912                 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4913         else
4914                 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4915
4916         rdev->sectors = rdev->sb_start;
4917
4918         if (test_bit(Faulty, &rdev->flags)) {
4919                 printk(KERN_WARNING 
4920                         "md: can not hot-add faulty %s disk to %s!\n",
4921                         bdevname(rdev->bdev,b), mdname(mddev));
4922                 err = -EINVAL;
4923                 goto abort_export;
4924         }
4925         clear_bit(In_sync, &rdev->flags);
4926         rdev->desc_nr = -1;
4927         rdev->saved_raid_disk = -1;
4928         err = bind_rdev_to_array(rdev, mddev);
4929         if (err)
4930                 goto abort_export;
4931
4932         /*
4933          * The rest should better be atomic, we can have disk failures
4934          * noticed in interrupt contexts ...
4935          */
4936
4937         rdev->raid_disk = -1;
4938
4939         md_update_sb(mddev, 1);
4940
4941         /*
4942          * Kick recovery, maybe this spare has to be added to the
4943          * array immediately.
4944          */
4945         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4946         md_wakeup_thread(mddev->thread);
4947         md_new_event(mddev);
4948         return 0;
4949
4950 abort_export:
4951         export_rdev(rdev);
4952         return err;
4953 }
4954
4955 static int set_bitmap_file(mddev_t *mddev, int fd)
4956 {
4957         int err;
4958
4959         if (mddev->pers) {
4960                 if (!mddev->pers->quiesce)
4961                         return -EBUSY;
4962                 if (mddev->recovery || mddev->sync_thread)
4963                         return -EBUSY;
4964                 /* we should be able to change the bitmap.. */
4965         }
4966
4967
4968         if (fd >= 0) {
4969                 if (mddev->bitmap)
4970                         return -EEXIST; /* cannot add when bitmap is present */
4971                 mddev->bitmap_file = fget(fd);
4972
4973                 if (mddev->bitmap_file == NULL) {
4974                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4975                                mdname(mddev));
4976                         return -EBADF;
4977                 }
4978
4979                 err = deny_bitmap_write_access(mddev->bitmap_file);
4980                 if (err) {
4981                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4982                                mdname(mddev));
4983                         fput(mddev->bitmap_file);
4984                         mddev->bitmap_file = NULL;
4985                         return err;
4986                 }
4987                 mddev->bitmap_offset = 0; /* file overrides offset */
4988         } else if (mddev->bitmap == NULL)
4989                 return -ENOENT; /* cannot remove what isn't there */
4990         err = 0;
4991         if (mddev->pers) {
4992                 mddev->pers->quiesce(mddev, 1);
4993                 if (fd >= 0)
4994                         err = bitmap_create(mddev);
4995                 if (fd < 0 || err) {
4996                         bitmap_destroy(mddev);
4997                         fd = -1; /* make sure to put the file */
4998                 }
4999                 mddev->pers->quiesce(mddev, 0);
5000         }
5001         if (fd < 0) {
5002                 if (mddev->bitmap_file) {
5003                         restore_bitmap_write_access(mddev->bitmap_file);
5004                         fput(mddev->bitmap_file);
5005                 }
5006                 mddev->bitmap_file = NULL;
5007         }
5008
5009         return err;
5010 }
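
/*
 * Sketch of driving the function above through the SET_BITMAP_FILE
 * ioctl (paths are illustrative assumptions, error handling omitted):
 *
 *	int md = open("/dev/md0", O_RDONLY);
 *	int bm = open("/var/md0-bitmap", O_RDWR);
 *
 *	ioctl(md, SET_BITMAP_FILE, bm);
 *
 * and later, passing fd == -1 removes the bitmap again:
 *
 *	ioctl(md, SET_BITMAP_FILE, -1);
 */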
5011
5012 /*
5013  * set_array_info is used in two different ways.
5014  * The original usage is when creating a new array.
5015  * In this usage, raid_disks is > 0 and it, together with
5016  *  level, size, not_persistent, layout and chunksize, determines the
5017  *  shape of the array.
5018  *  This will always create an array with a type-0.90.0 superblock.
5019  * The newer usage is when assembling an array.
5020  *  In this case raid_disks will be 0, and the major_version field is
5021  *  used to determine which style of superblocks are to be found on the devices.
5022  *  The minor and patch _version numbers are also kept in case the
5023  *  super_block handler wishes to interpret them.
5024  */
5025 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5026 {
5027
5028         if (info->raid_disks == 0) {
5029                 /* just setting version number for superblock loading */
5030                 if (info->major_version < 0 ||
5031                     info->major_version >= ARRAY_SIZE(super_types) ||
5032                     super_types[info->major_version].name == NULL) {
5033                         /* maybe try to auto-load a module? */
5034                         printk(KERN_INFO 
5035                                 "md: superblock version %d not known\n",
5036                                 info->major_version);
5037                         return -EINVAL;
5038                 }
5039                 mddev->major_version = info->major_version;
5040                 mddev->minor_version = info->minor_version;
5041                 mddev->patch_version = info->patch_version;
5042                 mddev->persistent = !info->not_persistent;
5043                 return 0;
5044         }
5045         mddev->major_version = MD_MAJOR_VERSION;
5046         mddev->minor_version = MD_MINOR_VERSION;
5047         mddev->patch_version = MD_PATCHLEVEL_VERSION;
5048         mddev->ctime         = get_seconds();
5049
5050         mddev->level         = info->level;
5051         mddev->clevel[0]     = 0;
5052         mddev->dev_sectors   = 2 * (sector_t)info->size;
5053         mddev->raid_disks    = info->raid_disks;
5054         /* don't set md_minor, it is determined by which /dev/md* was
5055          * opened
5056          */
5057         if (info->state & (1<<MD_SB_CLEAN))
5058                 mddev->recovery_cp = MaxSector;
5059         else
5060                 mddev->recovery_cp = 0;
5061         mddev->persistent    = ! info->not_persistent;
5062         mddev->external      = 0;
5063
5064         mddev->layout        = info->layout;
5065         mddev->chunk_sectors = info->chunk_size >> 9;
5066
5067         mddev->max_disks     = MD_SB_DISKS;
5068
5069         if (mddev->persistent)
5070                 mddev->flags         = 0;
5071         set_bit(MD_CHANGE_DEVS, &mddev->flags);
5072
5073         mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
5074         mddev->bitmap_offset = 0;
5075
5076         mddev->reshape_position = MaxSector;
5077
5078         /*
5079          * Generate a 128 bit UUID
5080          */
5081         get_random_bytes(mddev->uuid, 16);
5082
5083         mddev->new_level = mddev->level;
5084         mddev->new_chunk_sectors = mddev->chunk_sectors;
5085         mddev->new_layout = mddev->layout;
5086         mddev->delta_disks = 0;
5087
5088         return 0;
5089 }
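
/*
 * Sketch of the two SET_ARRAY_INFO usages described above (two
 * independent calls; all values illustrative):
 *
 * Assembling, raid_disks == 0, so only the superblock version matters:
 *
 *	mdu_array_info_t info = { 0 };
 *	info.major_version = 1;
 *	info.minor_version = 2;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 * Creating a new array with a 0.90 superblock, raid_disks > 0:
 *
 *	mdu_array_info_t info = { 0 };
 *	info.level      = 0;
 *	info.raid_disks = 2;
 *	info.size       = 1048576;	(KiB of each device that is used)
 *	info.chunk_size = 65536;	(bytes)
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 */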
5090
5091 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5092 {
5093         WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5094
5095         if (mddev->external_size)
5096                 return;
5097
5098         mddev->array_sectors = array_sectors;
5099 }
5100 EXPORT_SYMBOL(md_set_array_sectors);
5101
5102 static int update_size(mddev_t *mddev, sector_t num_sectors)
5103 {
5104         mdk_rdev_t *rdev;
5105         int rv;
5106         int fit = (num_sectors == 0);
5107
5108         if (mddev->pers->resize == NULL)
5109                 return -EINVAL;
5110         /* The "num_sectors" is the number of sectors of each device that
5111          * is used.  This can only make sense for arrays with redundancy.
5112          * linear and raid0 always use whatever space is available. We can only
5113          * consider changing this number if no resync or reconstruction is
5114          * happening, and if the new size is acceptable. It must fit before the
5115          * sb_start or, if that is <data_offset, it must fit before the size
5116          * of each device.  If num_sectors is zero, we find the largest size
5117          * that fits.
5118          */
5120         if (mddev->sync_thread)
5121                 return -EBUSY;
5122         if (mddev->bitmap)
5123                 /* Sorry, cannot grow a bitmap yet, just remove it,
5124                  * grow, and re-add.
5125                  */
5126                 return -EBUSY;
5127         list_for_each_entry(rdev, &mddev->disks, same_set) {
5128                 sector_t avail = rdev->sectors;
5129
5130                 if (fit && (num_sectors == 0 || num_sectors > avail))
5131                         num_sectors = avail;
5132                 if (avail < num_sectors)
5133                         return -ENOSPC;
5134         }
5135         rv = mddev->pers->resize(mddev, num_sectors);
5136         if (!rv)
5137                 revalidate_disk(mddev->gendisk);
5138         return rv;
5139 }
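
/*
 * Design note: num_sectors == 0 means "grow to the largest size that
 * fits on every member"; the loop above lowers num_sectors to the
 * smallest rdev->sectors.  A non-zero request must fit on all members
 * or -ENOSPC is returned before the personality's ->resize() is ever
 * called.  update_array_info() below feeds it the ioctl value as
 * update_size(mddev, (sector_t)info->size * 2).
 */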
5140
5141 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5142 {
5143         int rv;
5144         /* change the number of raid disks */
5145         if (mddev->pers->check_reshape == NULL)
5146                 return -EINVAL;
5147         if (raid_disks <= 0 ||
5148             raid_disks >= mddev->max_disks)
5149                 return -EINVAL;
5150         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5151                 return -EBUSY;
5152         mddev->delta_disks = raid_disks - mddev->raid_disks;
5153
5154         rv = mddev->pers->check_reshape(mddev);
5155         return rv;
5156 }
5157
5158
5159 /*
5160  * update_array_info is used to change the configuration of an
5161  * on-line array.
5162  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
5163  * fields in the info are checked against the array.
5164  * Any differences that cannot be handled will cause an error.
5165  * Normally, only one change can be managed at a time.
5166  */
5167 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5168 {
5169         int rv = 0;
5170         int cnt = 0;
5171         int state = 0;
5172
5173         /* calculate expected state, ignoring low bits */
5174         if (mddev->bitmap && mddev->bitmap_offset)
5175                 state |= (1 << MD_SB_BITMAP_PRESENT);
5176
5177         if (mddev->major_version != info->major_version ||
5178             mddev->minor_version != info->minor_version ||
5179 /*          mddev->patch_version != info->patch_version || */
5180             mddev->ctime         != info->ctime         ||
5181             mddev->level         != info->level         ||
5182 /*          mddev->layout        != info->layout        || */
5183             !mddev->persistent   != info->not_persistent||
5184             mddev->chunk_sectors != info->chunk_size >> 9 ||
5185             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5186             ((state^info->state) & 0xfffffe00)
5187                 )
5188                 return -EINVAL;
5189         /* Check there is only one change */
5190         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5191                 cnt++;
5192         if (mddev->raid_disks != info->raid_disks)
5193                 cnt++;
5194         if (mddev->layout != info->layout)
5195                 cnt++;
5196         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5197                 cnt++;
5198         if (cnt == 0)
5199                 return 0;
5200         if (cnt > 1)
5201                 return -EINVAL;
5202
5203         if (mddev->layout != info->layout) {
5204                 /* Change layout
5205                  * we don't need to do anything at the md level, the
5206                  * personality will take care of it all.
5207                  */
5208                 if (mddev->pers->check_reshape == NULL)
5209                         return -EINVAL;
5210                 else {
5211                         mddev->new_layout = info->layout;
5212                         rv = mddev->pers->check_reshape(mddev);
5213                         if (rv)
5214                                 mddev->new_layout = mddev->layout;
5215                         return rv;
5216                 }
5217         }
5218         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5219                 rv = update_size(mddev, (sector_t)info->size * 2);
5220
5221         if (mddev->raid_disks    != info->raid_disks)
5222                 rv = update_raid_disks(mddev, info->raid_disks);
5223
5224         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5225                 if (mddev->pers->quiesce == NULL)
5226                         return -EINVAL;
5227                 if (mddev->recovery || mddev->sync_thread)
5228                         return -EBUSY;
5229                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5230                         /* add the bitmap */
5231                         if (mddev->bitmap)
5232                                 return -EEXIST;
5233                         if (mddev->default_bitmap_offset == 0)
5234                                 return -EINVAL;
5235                         mddev->bitmap_offset = mddev->default_bitmap_offset;
5236                         mddev->pers->quiesce(mddev, 1);
5237                         rv = bitmap_create(mddev);
5238                         if (rv)
5239                                 bitmap_destroy(mddev);
5240                         mddev->pers->quiesce(mddev, 0);
5241                 } else {
5242                         /* remove the bitmap */
5243                         if (!mddev->bitmap)
5244                                 return -ENOENT;
5245                         if (mddev->bitmap->file)
5246                                 return -EINVAL;
5247                         mddev->pers->quiesce(mddev, 1);
5248                         bitmap_destroy(mddev);
5249                         mddev->pers->quiesce(mddev, 0);
5250                         mddev->bitmap_offset = 0;
5251                 }
5252         }
5253         md_update_sb(mddev, 1);
5254         return rv;
5255 }
5256
5257 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5258 {
5259         mdk_rdev_t *rdev;
5260
5261         if (mddev->pers == NULL)
5262                 return -ENODEV;
5263
5264         rdev = find_rdev(mddev, dev);
5265         if (!rdev)
5266                 return -ENODEV;
5267
5268         md_error(mddev, rdev);
5269         return 0;
5270 }
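
/*
 * Hypothetical userspace sketch (device name illustrative): the
 * SET_DISK_FAULTY ioctl carries the member's device number, which
 * md_ioctl() decodes with new_decode_dev() before calling the function
 * above:
 *
 *	struct stat st;
 *
 *	stat("/dev/sdb", &st);
 *	ioctl(md_fd, SET_DISK_FAULTY, (unsigned long)st.st_rdev);
 */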
5271
5272 /*
5273  * We have a problem here: there is no easy way to give a CHS
5274  * virtual geometry. We currently pretend that we have 2 heads,
5275  * 4 sectors (with a BIG number of cylinders...). This drives
5276  * dosfs just mad... ;-)
5277  */
5278 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5279 {
5280         mddev_t *mddev = bdev->bd_disk->private_data;
5281
5282         geo->heads = 2;
5283         geo->sectors = 4;
5284         geo->cylinders = get_capacity(mddev->gendisk) / 8;
5285         return 0;
5286 }
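
/*
 * Worked example for the fake geometry above: a 1 GiB array is
 * 2097152 512-byte sectors; with 2 heads and 4 sectors per track a
 * cylinder holds 2 * 4 = 8 sectors, so md_getgeo() reports
 * 2097152 / 8 = 262144 cylinders.
 */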
5287
5288 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5289                         unsigned int cmd, unsigned long arg)
5290 {
5291         int err = 0;
5292         void __user *argp = (void __user *)arg;
5293         mddev_t *mddev = NULL;
5294
5295         if (!capable(CAP_SYS_ADMIN))
5296                 return -EACCES;
5297
5298         /*
5299          * Commands dealing with the RAID driver but not any
5300          * particular array:
5301          */
5302         switch (cmd)
5303         {
5304                 case RAID_VERSION:
5305                         err = get_version(argp);
5306                         goto done;
5307
5308                 case PRINT_RAID_DEBUG:
5309                         err = 0;
5310                         md_print_devices();
5311                         goto done;
5312
5313 #ifndef MODULE
5314                 case RAID_AUTORUN:
5315                         err = 0;
5316                         autostart_arrays(arg);
5317                         goto done;
5318 #endif
5319                 default:;
5320         }
5321
5322         /*
5323          * Commands creating/starting a new array:
5324          */
5325
5326         mddev = bdev->bd_disk->private_data;
5327
5328         if (!mddev) {
5329                 BUG();
5330                 goto abort;
5331         }
5332
5333         err = mddev_lock(mddev);
5334         if (err) {
5335                 printk(KERN_INFO 
5336                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
5337                         err, cmd);
5338                 goto abort;
5339         }
5340
5341         switch (cmd)
5342         {
5343                 case SET_ARRAY_INFO:
5344                         {
5345                                 mdu_array_info_t info;
5346                                 if (!arg)
5347                                         memset(&info, 0, sizeof(info));
5348                                 else if (copy_from_user(&info, argp, sizeof(info))) {
5349                                         err = -EFAULT;
5350                                         goto abort_unlock;
5351                                 }
5352                                 if (mddev->pers) {
5353                                         err = update_array_info(mddev, &info);
5354                                         if (err) {
5355                                                 printk(KERN_WARNING "md: couldn't update"
5356                                                        " array info. %d\n", err);
5357                                                 goto abort_unlock;
5358                                         }
5359                                         goto done_unlock;
5360                                 }
5361                                 if (!list_empty(&mddev->disks)) {
5362                                         printk(KERN_WARNING
5363                                                "md: array %s already has disks!\n",
5364                                                mdname(mddev));
5365                                         err = -EBUSY;
5366                                         goto abort_unlock;
5367                                 }
5368                                 if (mddev->raid_disks) {
5369                                         printk(KERN_WARNING
5370                                                "md: array %s already initialised!\n",
5371                                                mdname(mddev));
5372                                         err = -EBUSY;
5373                                         goto abort_unlock;
5374                                 }
5375                                 err = set_array_info(mddev, &info);
5376                                 if (err) {
5377                                         printk(KERN_WARNING "md: couldn't set"
5378                                                " array info. %d\n", err);
5379                                         goto abort_unlock;
5380                                 }
5381                         }
5382                         goto done_unlock;
5383
5384                 default:;
5385         }
5386
5387         /*
5388          * Commands querying/configuring an existing array:
5389          */
5390         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5391          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5392         if ((!mddev->raid_disks && !mddev->external)
5393             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5394             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5395             && cmd != GET_BITMAP_FILE) {
5396                 err = -ENODEV;
5397                 goto abort_unlock;
5398         }
5399
5400         /*
5401          * Commands even a read-only array can execute:
5402          */
5403         switch (cmd)
5404         {
5405                 case GET_ARRAY_INFO:
5406                         err = get_array_info(mddev, argp);
5407                         goto done_unlock;
5408
5409                 case GET_BITMAP_FILE:
5410                         err = get_bitmap_file(mddev, argp);
5411                         goto done_unlock;
5412
5413                 case GET_DISK_INFO:
5414                         err = get_disk_info(mddev, argp);
5415                         goto done_unlock;
5416
5417                 case RESTART_ARRAY_RW:
5418                         err = restart_array(mddev);
5419                         goto done_unlock;
5420
5421                 case STOP_ARRAY:
5422                         err = do_md_stop(mddev, 0, 1);
5423                         goto done_unlock;
5424
5425                 case STOP_ARRAY_RO:
5426                         err = do_md_stop(mddev, 1, 1);
5427                         goto done_unlock;
5428
5429         }
5430
5431         /*
5432          * The remaining ioctls are changing the state of the
5433          * superblock, so we do not allow them on read-only arrays.
5434          * However non-MD ioctls (e.g. get-size) will still come through
5435          * here and hit the 'default' below, so only disallow
5436          * 'md' ioctls, and switch to rw mode if started auto-readonly.
5437          */
5438         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5439                 if (mddev->ro == 2) {
5440                         mddev->ro = 0;
5441                         sysfs_notify_dirent(mddev->sysfs_state);
5442                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5443                         md_wakeup_thread(mddev->thread);
5444                 } else {
5445                         err = -EROFS;
5446                         goto abort_unlock;
5447                 }
5448         }
5449
5450         switch (cmd)
5451         {
5452                 case ADD_NEW_DISK:
5453                 {
5454                         mdu_disk_info_t info;
5455                         if (copy_from_user(&info, argp, sizeof(info)))
5456                                 err = -EFAULT;
5457                         else
5458                                 err = add_new_disk(mddev, &info);
5459                         goto done_unlock;
5460                 }
5461
5462                 case HOT_REMOVE_DISK:
5463                         err = hot_remove_disk(mddev, new_decode_dev(arg));
5464                         goto done_unlock;
5465
5466                 case HOT_ADD_DISK:
5467                         err = hot_add_disk(mddev, new_decode_dev(arg));
5468                         goto done_unlock;
5469
5470                 case SET_DISK_FAULTY:
5471                         err = set_disk_faulty(mddev, new_decode_dev(arg));
5472                         goto done_unlock;
5473
5474                 case RUN_ARRAY:
5475                         err = do_md_run(mddev);
5476                         goto done_unlock;
5477
5478                 case SET_BITMAP_FILE:
5479                         err = set_bitmap_file(mddev, (int)arg);
5480                         goto done_unlock;
5481
5482                 default:
5483                         err = -EINVAL;
5484                         goto abort_unlock;
5485         }
5486
5487 done_unlock:
5488 abort_unlock:
5489         if (mddev->hold_active == UNTIL_IOCTL &&
5490             err != -EINVAL)
5491                 mddev->hold_active = 0;
5492         mddev_unlock(mddev);
5493
5494         return err;
5495 done:
5496         if (err)
5497                 MD_BUG();
5498 abort:
5499         return err;
5500 }
5501
5502 static int md_open(struct block_device *bdev, fmode_t mode)
5503 {
5504         /*
5505          * Succeed if we can take ->open_mutex, which confirms that
5506          * the array isn't being stopped right now.
5507          */
5508         mddev_t *mddev = mddev_find(bdev->bd_dev);
5509         int err;
5510
5511         if (mddev->gendisk != bdev->bd_disk) {
5512                 /* we are racing with mddev_put which is discarding this
5513                  * bd_disk.
5514                  */
5515                 mddev_put(mddev);
5516                 /* Wait until bdev->bd_disk is definitely gone */
5517                 flush_scheduled_work();
5518                 /* Then retry the open from the top */
5519                 return -ERESTARTSYS;
5520         }
5521         BUG_ON(mddev != bdev->bd_disk->private_data);
5522
5523         if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5524                 goto out;
5525
5526         err = 0;
5527         atomic_inc(&mddev->openers);
5528         mutex_unlock(&mddev->open_mutex);
5529
5530         check_disk_change(bdev);
5531  out:
5532         return err;
5533 }
5534
5535 static int md_release(struct gendisk *disk, fmode_t mode)
5536 {
5537         mddev_t *mddev = disk->private_data;
5538
5539         BUG_ON(!mddev);
5540         atomic_dec(&mddev->openers);
5541         mddev_put(mddev);
5542
5543         return 0;
5544 }
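
/*
 * Note the pairing: md_open() bumps ->openers under open_mutex and
 * md_release() drops it again, while do_md_stop() takes the same mutex
 * and fails with -EBUSY whenever openers exceeds its is_open argument
 * (the ioctl caller's own open of the device accounts for 1).
 */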
5545
5546 static int md_media_changed(struct gendisk *disk)
5547 {
5548         mddev_t *mddev = disk->private_data;
5549
5550         return mddev->changed;
5551 }
5552
5553 static int md_revalidate(struct gendisk *disk)
5554 {
5555         mddev_t *mddev = disk->private_data;
5556
5557         mddev->changed = 0;
5558         return 0;
5559 }
5560 static struct block_device_operations md_fops =
5561 {
5562         .owner          = THIS_MODULE,
5563         .open           = md_open,
5564         .release        = md_release,
5565         .ioctl          = md_ioctl,
5566         .getgeo         = md_getgeo,
5567         .media_changed  = md_media_changed,
5568         .revalidate_disk= md_revalidate,
5569 };
5570
5571 static int md_thread(void * arg)
5572 {
5573         mdk_thread_t *thread = arg;
5574
5575         /*
5576          * md_thread is a 'system-thread', its priority should be very
5577          * high. We avoid resource deadlocks individually in each
5578          * raid personality. (RAID5 does preallocation) We also use RR and
5579          * the very same RT priority as kswapd, thus we will never get
5580          * into a priority inversion deadlock.
5581          *
5582          * we definitely have to have equal or higher priority than
5583          * bdflush, otherwise bdflush will deadlock if there are too
5584          * many dirty RAID5 blocks.
5585          */
5586
5587         allow_signal(SIGKILL);
5588         while (!kthread_should_stop()) {
5589
5590                 /* We need to wait INTERRUPTIBLE so that
5591                  * we don't add to the load-average.
5592                  * That means we need to be sure no signals are
5593                  * pending
5594                  */
5595                 if (signal_pending(current))
5596                         flush_signals(current);
5597
5598                 wait_event_interruptible_timeout
5599                         (thread->wqueue,
5600                          test_bit(THREAD_WAKEUP, &thread->flags)
5601                          || kthread_should_stop(),
5602                          thread->timeout);
5603
5604                 clear_bit(THREAD_WAKEUP, &thread->flags);
5605
5606                 thread->run(thread->mddev);
5607         }
5608
5609         return 0;
5610 }
5611
5612 void md_wakeup_thread(mdk_thread_t *thread)
5613 {
5614         if (thread) {
5615                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5616                 set_bit(THREAD_WAKEUP, &thread->flags);
5617                 wake_up(&thread->wqueue);
5618         }
5619 }
5620
5621 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5622                                  const char *name)
5623 {
5624         mdk_thread_t *thread;
5625
5626         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5627         if (!thread)
5628                 return NULL;
5629
5630         init_waitqueue_head(&thread->wqueue);
5631
5632         thread->run = run;
5633         thread->mddev = mddev;
5634         thread->timeout = MAX_SCHEDULE_TIMEOUT;
5635         thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
5636         if (IS_ERR(thread->tsk)) {
5637                 kfree(thread);
5638                 return NULL;
5639         }
5640         return thread;
5641 }
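
/*
 * Usage sketch (this mirrors the resync start earlier in this file):
 * the name argument is a printf format; kthread_run() substitutes
 * mdname(mddev) for the "%s":
 *
 *	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
 *						"%s_resync");
 *
 * A NULL return means the allocation or kthread_run() failed and the
 * caller must cope (see the resync-start error path above).
 */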
5642
5643 void md_unregister_thread(mdk_thread_t *thread)
5644 {
5645         if (!thread)
5646                 return;
5647         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5648
5649         kthread_stop(thread->tsk);
5650         kfree(thread);
5651 }
5652
5653 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5654 {
5655         if (!mddev) {
5656                 MD_BUG();
5657                 return;
5658         }
5659
5660         if (!rdev || test_bit(Faulty, &rdev->flags))
5661                 return;
5662
5663         if (mddev->external)
5664                 set_bit(Blocked, &rdev->flags);
5665 /*
5666         dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5667                 mdname(mddev),
5668                 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5669                 __builtin_return_address(0),__builtin_return_address(1),
5670                 __builtin_return_address(2),__builtin_return_address(3));
5671 */
5672         if (!mddev->pers)
5673                 return;
5674         if (!mddev->pers->error_handler)
5675                 return;
5676         mddev->pers->error_handler(mddev,rdev);
5677         if (mddev->degraded)
5678                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5679         set_bit(StateChanged, &rdev->flags);
5680         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5681         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5682         md_wakeup_thread(mddev->thread);
5683         md_new_event_inintr(mddev);
5684 }
5685
5686 /* seq_file implementation /proc/mdstat */
5687
5688 static void status_unused(struct seq_file *seq)
5689 {
5690         int i = 0;
5691         mdk_rdev_t *rdev;
5692
5693         seq_printf(seq, "unused devices: ");
5694
5695         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5696                 char b[BDEVNAME_SIZE];
5697                 i++;
5698                 seq_printf(seq, "%s ",
5699                               bdevname(rdev->bdev,b));
5700         }
5701         if (!i)
5702                 seq_printf(seq, "<none>");
5703
5704         seq_printf(seq, "\n");
5705 }
5706
5707
5708 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5709 {
5710         sector_t max_sectors, resync, res;
5711         unsigned long dt, db;
5712         sector_t rt;
5713         int scale;
5714         unsigned int per_milli;
5715
5716         resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5717
5718         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5719                 max_sectors = mddev->resync_max_sectors;
5720         else
5721                 max_sectors = mddev->dev_sectors;
5722
5723         /*
5724          * Should not happen.
5725          */
5726         if (!max_sectors) {
5727                 MD_BUG();
5728                 return;
5729         }
5730         /* Pick 'scale' such that (resync>>scale)*1000 will fit
5731          * in a sector_t, and (max_sectors>>scale) will fit in a
5732          * u32, as those are the requirements for sector_div.
5733          * Thus 'scale' must be at least 10
5734          */
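        /*
         * Worked example (illustrative): for a 4TiB component,
         * max_sectors = 2^33, the loop below leaves scale at 10, so
         * (max_sectors>>10)+1 is about 2^23 and fits a u32, while
         * (resync>>10)*1000 stays below 2^33 and fits any sector_t.
         */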
5735         scale = 10;
5736         if (sizeof(sector_t) > sizeof(unsigned long)) {
5737                 while ( max_sectors/2 > (1ULL<<(scale+32)))
5738                         scale++;
5739         }
5740         res = (resync>>scale)*1000;
5741         sector_div(res, (u32)((max_sectors>>scale)+1));
5742
5743         per_milli = res;
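        /*
         * Render a 20-step progress bar: x = per_milli/50 gives the
         * number of '=' characters, so 35.0% prints as
         * "[=======>.............]".
         */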
5744         {
5745                 int i, x = per_milli/50, y = 20-x;
5746                 seq_printf(seq, "[");
5747                 for (i = 0; i < x; i++)
5748                         seq_printf(seq, "=");
5749                 seq_printf(seq, ">");
5750                 for (i = 0; i < y; i++)
5751                         seq_printf(seq, ".");
5752                 seq_printf(seq, "] ");
5753         }
5754         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5755                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5756                     "reshape" :
5757                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5758                      "check" :
5759                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5760                       "resync" : "recovery"))),
5761                    per_milli/10, per_milli % 10,
5762                    (unsigned long long) resync/2,
5763                    (unsigned long long) max_sectors/2);
5764
5765         /*
5766          * dt: time from mark until now
5767          * db: blocks written from mark until now
5768          * rt: remaining time
5769          *
5770          * rt is a sector_t, so could be 32bit or 64bit.
5771          * So we divide before multiply in case it is 32bit and close
5772          * to the limit.
5773          * We scale the divisor (db) by 32 to avoid losing precision
5774          * near the end of resync when the number of remaining sectors
5775          * is close to 'db'.
5776          * We then divide rt by 32 after multiplying by db to compensate.
5777          * The '+1' avoids division by zero if db is very small.
5778          */
5779         dt = ((jiffies - mddev->resync_mark) / HZ);
5780         if (!dt) dt++;
5781         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5782                 - mddev->resync_mark_cnt;
5783
5784         rt = max_sectors - resync;    /* number of remaining sectors */
5785         sector_div(rt, db/32+1);
5786         rt *= dt;
5787         rt >>= 5;
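        /*
         * Illustrative numbers: dt = 3s, db = 6144 sectors (1MiB/s)
         * and 1000000 sectors remaining give rt = 1000000/193 * 3 >> 5,
         * i.e. ~485s, close to the exact 1000000/2048 ~= 488s.
         */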
5788
5789         seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
5790                    ((unsigned long)rt % 60)/6);
5791
5792         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5793 }
5794
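/*
 * seq_file iterator for /proc/mdstat.  Two sentinel cookies are used:
 * (void*)1 asks md_seq_show() for the "Personalities" header and
 * (void*)2 for the trailing unused-devices line; anything else is an
 * mddev pinned with mddev_get() while it is being shown.  The position
 * value 0x10000 simply parks the iterator beyond any plausible number
 * of arrays once the tail has been emitted.
 */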
5795 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5796 {
5797         struct list_head *tmp;
5798         loff_t l = *pos;
5799         mddev_t *mddev;
5800
5801         if (l >= 0x10000)
5802                 return NULL;
5803         if (!l--)
5804                 /* header */
5805                 return (void*)1;
5806
5807         spin_lock(&all_mddevs_lock);
5808         list_for_each(tmp,&all_mddevs)
5809                 if (!l--) {
5810                         mddev = list_entry(tmp, mddev_t, all_mddevs);
5811                         mddev_get(mddev);
5812                         spin_unlock(&all_mddevs_lock);
5813                         return mddev;
5814                 }
5815         spin_unlock(&all_mddevs_lock);
5816         if (!l--)
5817                 return (void*)2;/* tail */
5818         return NULL;
5819 }
5820
5821 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5822 {
5823         struct list_head *tmp;
5824         mddev_t *next_mddev, *mddev = v;
5825
5826         ++*pos;
5827         if (v == (void*)2)
5828                 return NULL;
5829
5830         spin_lock(&all_mddevs_lock);
5831         if (v == (void*)1)
5832                 tmp = all_mddevs.next;
5833         else
5834                 tmp = mddev->all_mddevs.next;
5835         if (tmp != &all_mddevs)
5836                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5837         else {
5838                 next_mddev = (void*)2;
5839                 *pos = 0x10000;
5840         }
5841         spin_unlock(&all_mddevs_lock);
5842
5843         if (v != (void*)1)
5844                 mddev_put(mddev);
5845         return next_mddev;
5846
5847 }
5848
5849 static void md_seq_stop(struct seq_file *seq, void *v)
5850 {
5851         mddev_t *mddev = v;
5852
5853         if (mddev && v != (void*)1 && v != (void*)2)
5854                 mddev_put(mddev);
5855 }
5856
5857 struct mdstat_info {
5858         int event;
5859 };
5860
5861 static int md_seq_show(struct seq_file *seq, void *v)
5862 {
5863         mddev_t *mddev = v;
5864         sector_t sectors;
5865         mdk_rdev_t *rdev;
5866         struct mdstat_info *mi = seq->private;
5867         struct bitmap *bitmap;
5868
5869         if (v == (void*)1) {
5870                 struct mdk_personality *pers;
5871                 seq_printf(seq, "Personalities : ");
5872                 spin_lock(&pers_lock);
5873                 list_for_each_entry(pers, &pers_list, list)
5874                         seq_printf(seq, "[%s] ", pers->name);
5875
5876                 spin_unlock(&pers_lock);
5877                 seq_printf(seq, "\n");
5878                 mi->event = atomic_read(&md_event_count);
5879                 return 0;
5880         }
5881         if (v == (void*)2) {
5882                 status_unused(seq);
5883                 return 0;
5884         }
5885
5886         if (mddev_lock(mddev) < 0)
5887                 return -EINTR;
5888
5889         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5890                 seq_printf(seq, "%s : %sactive", mdname(mddev),
5891                                                 mddev->pers ? "" : "in");
5892                 if (mddev->pers) {
5893                         if (mddev->ro==1)
5894                                 seq_printf(seq, " (read-only)");
5895                         if (mddev->ro==2)
5896                                 seq_printf(seq, " (auto-read-only)");
5897                         seq_printf(seq, " %s", mddev->pers->name);
5898                 }
5899
5900                 sectors = 0;
5901                 list_for_each_entry(rdev, &mddev->disks, same_set) {
5902                         char b[BDEVNAME_SIZE];
5903                         seq_printf(seq, " %s[%d]",
5904                                 bdevname(rdev->bdev,b), rdev->desc_nr);
5905                         if (test_bit(WriteMostly, &rdev->flags))
5906                                 seq_printf(seq, "(W)");
5907                         if (test_bit(Faulty, &rdev->flags)) {
5908                                 seq_printf(seq, "(F)");
5909                                 continue;
5910                         } else if (rdev->raid_disk < 0)
5911                                 seq_printf(seq, "(S)"); /* spare */
5912                         sectors += rdev->sectors;
5913                 }
5914
5915                 if (!list_empty(&mddev->disks)) {
5916                         if (mddev->pers)
5917                                 seq_printf(seq, "\n      %llu blocks",
5918                                            (unsigned long long)
5919                                            mddev->array_sectors / 2);
5920                         else
5921                                 seq_printf(seq, "\n      %llu blocks",
5922                                            (unsigned long long)sectors / 2);
5923                 }
5924                 if (mddev->persistent) {
5925                         if (mddev->major_version != 0 ||
5926                             mddev->minor_version != 90) {
5927                                 seq_printf(seq," super %d.%d",
5928                                            mddev->major_version,
5929                                            mddev->minor_version);
5930                         }
5931                 } else if (mddev->external)
5932                         seq_printf(seq, " super external:%s",
5933                                    mddev->metadata_type);
5934                 else
5935                         seq_printf(seq, " super non-persistent");
5936
5937                 if (mddev->pers) {
5938                         mddev->pers->status(seq, mddev);
5939                         seq_printf(seq, "\n      ");
5940                         if (mddev->pers->sync_request) {
5941                                 if (mddev->curr_resync > 2) {
5942                                         status_resync(seq, mddev);
5943                                         seq_printf(seq, "\n      ");
5944                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5945                                         seq_printf(seq, "\tresync=DELAYED\n      ");
5946                                 else if (mddev->recovery_cp < MaxSector)
5947                                         seq_printf(seq, "\tresync=PENDING\n      ");
5948                         }
5949                 } else
5950                         seq_printf(seq, "\n       ");
5951
5952                 if ((bitmap = mddev->bitmap)) {
5953                         unsigned long chunk_kb;
5954                         unsigned long flags;
5955                         spin_lock_irqsave(&bitmap->lock, flags);
5956                         chunk_kb = bitmap->chunksize >> 10;
5957                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5958                                 "%lu%s chunk",
5959                                 bitmap->pages - bitmap->missing_pages,
5960                                 bitmap->pages,
5961                                 (bitmap->pages - bitmap->missing_pages)
5962                                         << (PAGE_SHIFT - 10),
5963                                 chunk_kb ? chunk_kb : bitmap->chunksize,
5964                                 chunk_kb ? "KB" : "B");
5965                         if (bitmap->file) {
5966                                 seq_printf(seq, ", file: ");
5967                                 seq_path(seq, &bitmap->file->f_path, " \t\n");
5968                         }
5969
5970                         seq_printf(seq, "\n");
5971                         spin_unlock_irqrestore(&bitmap->lock, flags);
5972                 }
5973
5974                 seq_printf(seq, "\n");
5975         }
5976         mddev_unlock(mddev);
5977
5978         return 0;
5979 }
5980
5981 static const struct seq_operations md_seq_ops = {
5982         .start  = md_seq_start,
5983         .next   = md_seq_next,
5984         .stop   = md_seq_stop,
5985         .show   = md_seq_show,
5986 };
5987
5988 static int md_seq_open(struct inode *inode, struct file *file)
5989 {
5990         int error;
5991         struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
5992         if (mi == NULL)
5993                 return -ENOMEM;
5994
5995         error = seq_open(file, &md_seq_ops);
5996         if (error)
5997                 kfree(mi);
5998         else {
5999                 struct seq_file *p = file->private_data;
6000                 p->private = mi;
6001                 mi->event = atomic_read(&md_event_count);
6002         }
6003         return error;
6004 }
6005
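/*
 * Poll support for /proc/mdstat: a monitoring program (mdadm --monitor,
 * for example) can poll() the file and is woken with POLLPRI|POLLERR
 * whenever md_event_count has advanced since this descriptor last read
 * the file.  A hypothetical consumer, for illustration only:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	read(fd, buf, sizeof(buf));
 *	poll(&pfd, 1, -1);
 */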
6006 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6007 {
6008         struct seq_file *m = filp->private_data;
6009         struct mdstat_info *mi = m->private;
6010         int mask;
6011
6012         poll_wait(filp, &md_event_waiters, wait);
6013
6014         /* always allow read */
6015         mask = POLLIN | POLLRDNORM;
6016
6017         if (mi->event != atomic_read(&md_event_count))
6018                 mask |= POLLERR | POLLPRI;
6019         return mask;
6020 }
6021
6022 static const struct file_operations md_seq_fops = {
6023         .owner          = THIS_MODULE,
6024         .open           = md_seq_open,
6025         .read           = seq_read,
6026         .llseek         = seq_lseek,
6027         .release        = seq_release_private,
6028         .poll           = mdstat_poll,
6029 };
6030
6031 int register_md_personality(struct mdk_personality *p)
6032 {
6033         spin_lock(&pers_lock);
6034         list_add_tail(&p->list, &pers_list);
6035         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6036         spin_unlock(&pers_lock);
6037         return 0;
6038 }
6039
6040 int unregister_md_personality(struct mdk_personality *p)
6041 {
6042         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6043         spin_lock(&pers_lock);
6044         list_del_init(&p->list);
6045         spin_unlock(&pers_lock);
6046         return 0;
6047 }
6048
6049 static int is_mddev_idle(mddev_t *mddev, int init)
6050 {
6051         mdk_rdev_t * rdev;
6052         int idle;
6053         int curr_events;
6054
6055         idle = 1;
6056         rcu_read_lock();
6057         rdev_for_each_rcu(rdev, mddev) {
6058                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6059                 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6060                               (int)part_stat_read(&disk->part0, sectors[1]) -
6061                               atomic_read(&disk->sync_io);
6062                 /* sync IO will cause sync_io to increase before the disk_stats
6063                  * as sync_io is counted when a request starts, and
6064                  * disk_stats is counted when it completes.
6065                  * So resync activity will cause curr_events to be smaller than
6066                  * when there was no such activity.
6067                  * non-sync IO will cause disk_stats to increase without
6068                  * increasing sync_io so curr_events will (eventually)
6069                  * be larger than it was before.  Once it becomes
6070                  * substantially larger, the test below will cause
6071                  * the array to appear non-idle, and resync will slow
6072                  * down.
6073                  * If there is a lot of outstanding resync activity when
6074                  * we set last_events to curr_events, then all that activity
6075                  * completing might cause the array to appear non-idle
6076                  * and resync will be slowed down even though there might
6077                  * not have been non-resync activity.  This will only
6078                  * happen once though.  'last_events' will soon reflect
6079                  * the state where there is little or no outstanding
6080                  * resync requests, and further resync activity will
6081                  * always make curr_events less than last_events.
6082                  *
6083                  */
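                /*
                 * The 64-sector (32KiB) slack here absorbs the small
                 * accounting drift between sync_io and disk_stats
                 * described above, so only genuine non-resync IO
                 * should trip the test.
                 */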
6084                 if (init || curr_events - rdev->last_events > 64) {
6085                         rdev->last_events = curr_events;
6086                         idle = 0;
6087                 }
6088         }
6089         rcu_read_unlock();
6090         return idle;
6091 }
6092
6093 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6094 {
6095         /* another "blocks" (512-byte) blocks have been synced */
6096         atomic_sub(blocks, &mddev->recovery_active);
6097         wake_up(&mddev->recovery_wait);
6098         if (!ok) {
6099                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6100                 md_wakeup_thread(mddev->thread);
6101                 /* stop recovery, signal do_sync ... */
6102         }
6103 }
6104
6105
6106 /* md_write_start(mddev, bi)
6107  * If we need to update some array metadata (e.g. 'active' flag
6108  * in superblock) before writing, schedule a superblock update
6109  * and wait for it to complete.
6110  */
6111 void md_write_start(mddev_t *mddev, struct bio *bi)
6112 {
6113         int did_change = 0;
6114         if (bio_data_dir(bi) != WRITE)
6115                 return;
6116
6117         BUG_ON(mddev->ro == 1);
6118         if (mddev->ro == 2) {
6119                 /* need to switch to read/write */
6120                 mddev->ro = 0;
6121                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6122                 md_wakeup_thread(mddev->thread);
6123                 md_wakeup_thread(mddev->sync_thread);
6124                 did_change = 1;
6125         }
6126         atomic_inc(&mddev->writes_pending);
6127         if (mddev->safemode == 1)
6128                 mddev->safemode = 0;
6129         if (mddev->in_sync) {
6130                 spin_lock_irq(&mddev->write_lock);
6131                 if (mddev->in_sync) {
6132                         mddev->in_sync = 0;
6133                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6134                         md_wakeup_thread(mddev->thread);
6135                         did_change = 1;
6136                 }
6137                 spin_unlock_irq(&mddev->write_lock);
6138         }
6139         if (did_change)
6140                 sysfs_notify_dirent(mddev->sysfs_state);
6141         wait_event(mddev->sb_wait,
6142                    !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6143                    !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6144 }
6145
6146 void md_write_end(mddev_t *mddev)
6147 {
6148         if (atomic_dec_and_test(&mddev->writes_pending)) {
6149                 if (mddev->safemode == 2)
6150                         md_wakeup_thread(mddev->thread);
6151                 else if (mddev->safemode_delay)
6152                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6153         }
6154 }
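/*
 * Typical write-path pairing in a personality (sketch, not taken from
 * this file): md_write_start() may block while the 'active' superblock
 * update completes, and md_write_end() arms the safemode timer once the
 * last write drains:
 *
 *	md_write_start(mddev, bio);
 *	...submit the member-device IO...
 *	md_write_end(mddev);
 */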
6155
6156 /* md_allow_write(mddev)
6157  * Calling this ensures that the array is marked 'active' so that writes
6158  * may proceed without blocking.  It is important to call this before
6159  * attempting a GFP_KERNEL allocation while holding the mddev lock.
6160  * Must be called with mddev_lock held.
6161  *
6162  * In the ->external case MD_CHANGE_CLEAN cannot be cleared until mddev->lock
6163  * is dropped, so return -EAGAIN after notifying userspace.
6164  */
6165 int md_allow_write(mddev_t *mddev)
6166 {
6167         if (!mddev->pers)
6168                 return 0;
6169         if (mddev->ro)
6170                 return 0;
6171         if (!mddev->pers->sync_request)
6172                 return 0;
6173
6174         spin_lock_irq(&mddev->write_lock);
6175         if (mddev->in_sync) {
6176                 mddev->in_sync = 0;
6177                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6178                 if (mddev->safemode_delay &&
6179                     mddev->safemode == 0)
6180                         mddev->safemode = 1;
6181                 spin_unlock_irq(&mddev->write_lock);
6182                 md_update_sb(mddev, 0);
6183                 sysfs_notify_dirent(mddev->sysfs_state);
6184         } else
6185                 spin_unlock_irq(&mddev->write_lock);
6186
6187         if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6188                 return -EAGAIN;
6189         else
6190                 return 0;
6191 }
6192 EXPORT_SYMBOL_GPL(md_allow_write);
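/*
 * Sketch of the intended md_allow_write() calling pattern (hypothetical
 * caller, mddev_lock held): check for -EAGAIN before relying on the
 * array being marked active, then allocate:
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;
 *	ptr = kzalloc(len, GFP_KERNEL);
 */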
6193
6194 #define SYNC_MARKS      10
6195 #define SYNC_MARK_STEP  (3*HZ)
6196 void md_do_sync(mddev_t *mddev)
6197 {
6198         mddev_t *mddev2;
6199         unsigned int currspeed = 0,
6200                  window;
6201         sector_t max_sectors,j, io_sectors;
6202         unsigned long mark[SYNC_MARKS];
6203         sector_t mark_cnt[SYNC_MARKS];
6204         int last_mark,m;
6205         struct list_head *tmp;
6206         sector_t last_check;
6207         int skipped = 0;
6208         mdk_rdev_t *rdev;
6209         char *desc;
6210
6211         /* just in case the thread restarts... */
6212         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6213                 return;
6214         if (mddev->ro) /* never try to sync a read-only array */
6215                 return;
6216
6217         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6218                 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6219                         desc = "data-check";
6220                 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6221                         desc = "requested-resync";
6222                 else
6223                         desc = "resync";
6224         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6225                 desc = "reshape";
6226         else
6227                 desc = "recovery";
6228
6229         /* we overload curr_resync somewhat here.
6230          * 0 == not engaged in resync at all
6231          * 2 == checking that there is no conflict with another sync
6232          * 1 == like 2, but have yielded to allow conflicting resync to
6233          *              commence
6234          * other == active in resync - this many blocks
6235          *
6236          * Before starting a resync we must have set curr_resync to
6237          * 2, and then checked that every "conflicting" array has curr_resync
6238          * less than ours.  When we find one that is the same or higher
6239          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
6240          * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
6241          * This will mean we have to start checking from the beginning again.
6242          *
6243          */
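        /*
         * Example: arrays A and B share a component disk and both reach
         * curr_resync == 2.  The mddev with the lower address (say A)
         * drops to 1 and wakes resync_wait; B then proceeds while A
         * sleeps until B's curr_resync falls below its own.
         */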
6244
6245         do {
6246                 mddev->curr_resync = 2;
6247
6248         try_again:
6249                 if (kthread_should_stop()) {
6250                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6251                         goto skip;
6252                 }
6253                 for_each_mddev(mddev2, tmp) {
6254                         if (mddev2 == mddev)
6255                                 continue;
6256                         if (!mddev->parallel_resync
6257                         &&  mddev2->curr_resync
6258                         &&  match_mddev_units(mddev, mddev2)) {
6259                                 DEFINE_WAIT(wq);
6260                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
6261                                         /* arbitrarily yield */
6262                                         mddev->curr_resync = 1;
6263                                         wake_up(&resync_wait);
6264                                 }
6265                                 if (mddev > mddev2 && mddev->curr_resync == 1)
6266                                         /* no need to wait here, we can wait the next
6267                                          * time 'round when curr_resync == 2
6268                                          */
6269                                         continue;
6270                                 /* We need to wait 'interruptible' so as not to
6271                                  * contribute to the load average, and not to
6272                                  * be caught by 'softlockup'
6273                                  */
6274                                 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6275                                 if (!kthread_should_stop() &&
6276                                     mddev2->curr_resync >= mddev->curr_resync) {
6277                                         printk(KERN_INFO "md: delaying %s of %s"
6278                                                " until %s has finished (they"
6279                                                " share one or more physical units)\n",
6280                                                desc, mdname(mddev), mdname(mddev2));
6281                                         mddev_put(mddev2);
6282                                         if (signal_pending(current))
6283                                                 flush_signals(current);
6284                                         schedule();
6285                                         finish_wait(&resync_wait, &wq);
6286                                         goto try_again;
6287                                 }
6288                                 finish_wait(&resync_wait, &wq);
6289                         }
6290                 }
6291         } while (mddev->curr_resync < 2);
6292
6293         j = 0;
6294         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6295                 /* resync follows the size requested by the personality,
6296                  * which defaults to physical size, but can be virtual size
6297                  */
6298                 max_sectors = mddev->resync_max_sectors;
6299                 mddev->resync_mismatches = 0;
6300                 /* we don't use the checkpoint if there's a bitmap */
6301                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6302                         j = mddev->resync_min;
6303                 else if (!mddev->bitmap)
6304                         j = mddev->recovery_cp;
6305
6306         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6307                 max_sectors = mddev->dev_sectors;
6308         else {
6309                 /* recovery follows the physical size of devices */
6310                 max_sectors = mddev->dev_sectors;
6311                 j = MaxSector;
6312                 list_for_each_entry(rdev, &mddev->disks, same_set)
6313                         if (rdev->raid_disk >= 0 &&
6314                             !test_bit(Faulty, &rdev->flags) &&
6315                             !test_bit(In_sync, &rdev->flags) &&
6316                             rdev->recovery_offset < j)
6317                                 j = rdev->recovery_offset;
6318         }
6319
6320         printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6321         printk(KERN_INFO "md: minimum _guaranteed_  speed:"
6322                 " %d KB/sec/disk.\n", speed_min(mddev));
6323         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6324                "(but not more than %d KB/sec) for %s.\n",
6325                speed_max(mddev), desc);
6326
6327         is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6328
6329         io_sectors = 0;
6330         for (m = 0; m < SYNC_MARKS; m++) {
6331                 mark[m] = jiffies;
6332                 mark_cnt[m] = io_sectors;
6333         }
6334         last_mark = 0;
6335         mddev->resync_mark = mark[last_mark];
6336         mddev->resync_mark_cnt = mark_cnt[last_mark];
6337
6338         /*
6339          * Tune reconstruction:
6340          */
6341         window = 32*(PAGE_SIZE/512);
6342         printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6343                 window/2,(unsigned long long) max_sectors/2);
6344
6345         atomic_set(&mddev->recovery_active, 0);
6346         last_check = 0;
6347
6348         if (j>2) {
6349                 printk(KERN_INFO
6350                        "md: resuming %s of %s from checkpoint.\n",
6351                        desc, mdname(mddev));
6352                 mddev->curr_resync = j;
6353         }
6354
6355         while (j < max_sectors) {
6356                 sector_t sectors;
6357
6358                 skipped = 0;
6359
6360                 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6361                     ((mddev->curr_resync > mddev->curr_resync_completed &&
6362                       (mddev->curr_resync - mddev->curr_resync_completed)
6363                       > (max_sectors >> 4)) ||
6364                      (j - mddev->curr_resync_completed)*2
6365                      >= mddev->resync_max - mddev->curr_resync_completed
6366                             )) {
6367                         /* time to update curr_resync_completed */
6368                         blk_unplug(mddev->queue);
6369                         wait_event(mddev->recovery_wait,
6370                                    atomic_read(&mddev->recovery_active) == 0);
6371                         mddev->curr_resync_completed =
6372                                 mddev->curr_resync;
6373                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6374                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6375                 }
6376
6377                 while (j >= mddev->resync_max && !kthread_should_stop()) {
6378                         /* As this condition is controlled by user-space,
6379                          * we can block indefinitely, so use '_interruptible'
6380                          * to avoid triggering warnings.
6381                          */
6382                         flush_signals(current); /* just in case */
6383                         wait_event_interruptible(mddev->recovery_wait,
6384                                                  mddev->resync_max > j
6385                                                  || kthread_should_stop());
6386                 }
6387
6388                 if (kthread_should_stop())
6389                         goto interrupted;
6390
6391                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6392                                                   currspeed < speed_min(mddev));
6393                 if (sectors == 0) {
6394                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6395                         goto out;
6396                 }
6397
6398                 if (!skipped) { /* actual IO requested */
6399                         io_sectors += sectors;
6400                         atomic_add(sectors, &mddev->recovery_active);
6401                 }
6402
6403                 j += sectors;
6404                 if (j>1) mddev->curr_resync = j;
6405                 mddev->curr_mark_cnt = io_sectors;
6406                 if (last_check == 0)
6407                         /* this is the earliest that the rebuild will be
6408                          * visible in /proc/mdstat
6409                          */
6410                         md_new_event(mddev);
6411
6412                 if (last_check + window > io_sectors || j == max_sectors)
6413                         continue;
6414
6415                 last_check = io_sectors;
6416
6417                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6418                         break;
6419
6420         repeat:
6421                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6422                         /* step marks */
6423                         int next = (last_mark+1) % SYNC_MARKS;
6424
6425                         mddev->resync_mark = mark[next];
6426                         mddev->resync_mark_cnt = mark_cnt[next];
6427                         mark[next] = jiffies;
6428                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6429                         last_mark = next;
6430                 }
6431
6432
6433                 if (kthread_should_stop())
6434                         goto interrupted;
6435
6436
6437                 /*
6438                  * this loop exits only when we are slower than
6439                  * the 'hard' speed limit, or the system was IO-idle for
6440                  * a jiffy.
6441                  * the system might be non-idle CPU-wise, but we only care
6442                  * about not overloading the IO subsystem. (things like an
6443                  * e2fsck being done on the RAID array should execute fast)
6444                  */
6445                 blk_unplug(mddev->queue);
6446                 cond_resched();
6447
6448                 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6449                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
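                /*
                 * currspeed approximates KB/sec since the last mark:
                 * sectors/2 converts to KB, elapsed time is in whole
                 * seconds, and both '+1's merely keep the divisor and
                 * the result nonzero.
                 */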
6450
6451                 if (currspeed > speed_min(mddev)) {
6452                         if ((currspeed > speed_max(mddev)) ||
6453                                         !is_mddev_idle(mddev, 0)) {
6454                                 msleep(500);
6455                                 goto repeat;
6456                         }
6457                 }
6458         }
6459         printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6460         /*
6461          * this also signals 'finished resyncing' to md_stop
6462          */
6463  out:
6464         blk_unplug(mddev->queue);
6465
6466         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6467
6468         /* tell personality that we are finished */
6469         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6470
6471         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6472             mddev->curr_resync > 2) {
6473                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6474                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6475                                 if (mddev->curr_resync >= mddev->recovery_cp) {
6476                                         printk(KERN_INFO
6477                                                "md: checkpointing %s of %s.\n",
6478                                                desc, mdname(mddev));
6479                                         mddev->recovery_cp = mddev->curr_resync;
6480                                 }
6481                         } else
6482                                 mddev->recovery_cp = MaxSector;
6483                 } else {
6484                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6485                                 mddev->curr_resync = MaxSector;
6486                         list_for_each_entry(rdev, &mddev->disks, same_set)
6487                                 if (rdev->raid_disk >= 0 &&
6488                                     !test_bit(Faulty, &rdev->flags) &&
6489                                     !test_bit(In_sync, &rdev->flags) &&
6490                                     rdev->recovery_offset < mddev->curr_resync)
6491                                         rdev->recovery_offset = mddev->curr_resync;
6492                 }
6493         }
6494         set_bit(MD_CHANGE_DEVS, &mddev->flags);
6495
6496  skip:
6497         mddev->curr_resync = 0;
6498         mddev->curr_resync_completed = 0;
6499         mddev->resync_min = 0;
6500         mddev->resync_max = MaxSector;
6501         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6502         wake_up(&resync_wait);
6503         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6504         md_wakeup_thread(mddev->thread);
6505         return;
6506
6507  interrupted:
6508         /*
6509          * got a signal, exit.
6510          */
6511         printk(KERN_INFO
6512                "md: md_do_sync() got signal ... exiting\n");
6513         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6514         goto out;
6515
6516 }
6517 EXPORT_SYMBOL_GPL(md_do_sync);
6518
6519
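/*
 * remove_and_add_spares() - detach failed members, then wire in spares.
 *
 * Pass 1: any device that is Faulty (or not In_sync), not Blocked, and
 * has no IO pending is offered to ->hot_remove_disk() and its "rd%d"
 * sysfs link removed.  Pass 2 (degraded, writable arrays only): idle
 * spares are offered to ->hot_add_disk().  The return value counts the
 * devices a subsequent recovery pass would need to rebuild.
 */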
6520 static int remove_and_add_spares(mddev_t *mddev)
6521 {
6522         mdk_rdev_t *rdev;
6523         int spares = 0;
6524
6525         mddev->curr_resync_completed = 0;
6526
6527         list_for_each_entry(rdev, &mddev->disks, same_set)
6528                 if (rdev->raid_disk >= 0 &&
6529                     !test_bit(Blocked, &rdev->flags) &&
6530                     (test_bit(Faulty, &rdev->flags) ||
6531                      ! test_bit(In_sync, &rdev->flags)) &&
6532                     atomic_read(&rdev->nr_pending)==0) {
6533                         if (mddev->pers->hot_remove_disk(
6534                                     mddev, rdev->raid_disk)==0) {
6535                                 char nm[20];
6536                                 sprintf(nm,"rd%d", rdev->raid_disk);
6537                                 sysfs_remove_link(&mddev->kobj, nm);
6538                                 rdev->raid_disk = -1;
6539                         }
6540                 }
6541
6542         if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6543                 list_for_each_entry(rdev, &mddev->disks, same_set) {
6544                         if (rdev->raid_disk >= 0 &&
6545                             !test_bit(In_sync, &rdev->flags) &&
6546                             !test_bit(Blocked, &rdev->flags))
6547                                 spares++;
6548                         if (rdev->raid_disk < 0
6549                             && !test_bit(Faulty, &rdev->flags)) {
6550                                 rdev->recovery_offset = 0;
6551                                 if (mddev->pers->
6552                                     hot_add_disk(mddev, rdev) == 0) {
6553                                         char nm[20];
6554                                         sprintf(nm, "rd%d", rdev->raid_disk);
6555                                         if (sysfs_create_link(&mddev->kobj,
6556                                                               &rdev->kobj, nm))
6557                                                 printk(KERN_WARNING
6558                                                        "md: cannot register "
6559                                                        "%s for %s\n",
6560                                                        nm, mdname(mddev));
6561                                         spares++;
6562                                         md_new_event(mddev);
6563                                 } else
6564                                         break;
6565                         }
6566                 }
6567         }
6568         return spares;
6569 }
6570 /*
6571  * This routine is regularly called by all per-raid-array threads to
6572  * deal with generic issues like resync and super-block update.
6573  * Raid personalities that don't have a thread (linear/raid0) do not
6574  * need this as they never do any recovery or update the superblock.
6575  *
6576  * It does not do any resync itself, but rather "forks" off other threads
6577  * to do that as needed.
6578  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6579  * "->recovery" and create a thread at ->sync_thread.
6580  * When the thread finishes it sets MD_RECOVERY_DONE
6581  * and wakes up this thread, which will reap that thread and finish up.
6582  * This thread also removes any faulty devices (with nr_pending == 0).
6583  *
6584  * The overall approach is:
6585  *  1/ if the superblock needs updating, update it.
6586  *  2/ If a recovery thread is running, don't do anything else.
6587  *  3/ If recovery has finished, clean up, possibly marking spares active.
6588  *  4/ If there are any faulty devices, remove them.
6589  *  5/ If array is degraded, try to add spare devices
6590  *  6/ If array has spares or is not in-sync, start a resync thread.
6591  */
6592 void md_check_recovery(mddev_t *mddev)
6593 {
6594         mdk_rdev_t *rdev;
6595
6596
6597         if (mddev->bitmap)
6598                 bitmap_daemon_work(mddev->bitmap);
6599
6600         if (mddev->ro)
6601                 return;
6602
6603         if (signal_pending(current)) {
6604                 if (mddev->pers->sync_request && !mddev->external) {
6605                         printk(KERN_INFO "md: %s in immediate safe mode\n",
6606                                mdname(mddev));
6607                         mddev->safemode = 2;
6608                 }
6609                 flush_signals(current);
6610         }
6611
6612         if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6613                 return;
6614         if ( ! (
6615                 (mddev->flags && !mddev->external) ||
6616                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6617                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6618                 (mddev->external == 0 && mddev->safemode == 1) ||
6619                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6620                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6621                 ))
6622                 return;
6623
6624         if (mddev_trylock(mddev)) {
6625                 int spares = 0;
6626
6627                 if (mddev->ro) {
6628                         /* Only thing we do on a ro array is remove
6629                          * failed devices.
6630                          */
6631                         remove_and_add_spares(mddev);
6632                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6633                         goto unlock;
6634                 }
6635
6636                 if (!mddev->external) {
6637                         int did_change = 0;
6638                         spin_lock_irq(&mddev->write_lock);
6639                         if (mddev->safemode &&
6640                             !atomic_read(&mddev->writes_pending) &&
6641                             !mddev->in_sync &&
6642                             mddev->recovery_cp == MaxSector) {
6643                                 mddev->in_sync = 1;
6644                                 did_change = 1;
6645                                 if (mddev->persistent)
6646                                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6647                         }
6648                         if (mddev->safemode == 1)
6649                                 mddev->safemode = 0;
6650                         spin_unlock_irq(&mddev->write_lock);
6651                         if (did_change)
6652                                 sysfs_notify_dirent(mddev->sysfs_state);
6653                 }
6654
6655                 if (mddev->flags)
6656                         md_update_sb(mddev, 0);
6657
6658                 list_for_each_entry(rdev, &mddev->disks, same_set)
6659                         if (test_and_clear_bit(StateChanged, &rdev->flags))
6660                                 sysfs_notify_dirent(rdev->sysfs_state);
6661
6662
6663                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6664                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6665                         /* resync/recovery still happening */
6666                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6667                         goto unlock;
6668                 }
6669                 if (mddev->sync_thread) {
6670                         /* resync has finished, collect result */
6671                         md_unregister_thread(mddev->sync_thread);
6672                         mddev->sync_thread = NULL;
6673                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6674                             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6675                                 /* success...*/
6676                                 /* activate any spares */
6677                                 if (mddev->pers->spare_active(mddev))
6678                                         sysfs_notify(&mddev->kobj, NULL,
6679                                                      "degraded");
6680                         }
6681                         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6682                             mddev->pers->finish_reshape)
6683                                 mddev->pers->finish_reshape(mddev);
6684                         md_update_sb(mddev, 1);
6685
6686                         /* if array is no longer degraded, then any saved_raid_disk
6687                          * information must be scrapped
6688                          */
6689                         if (!mddev->degraded)
6690                                 list_for_each_entry(rdev, &mddev->disks, same_set)
6691                                         rdev->saved_raid_disk = -1;
6692
6693                         mddev->recovery = 0;
6694                         /* flag recovery needed just to double check */
6695                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6696                         sysfs_notify_dirent(mddev->sysfs_action);
6697                         md_new_event(mddev);
6698                         goto unlock;
6699                 }
6700                 /* Set RUNNING before clearing NEEDED to avoid
6701                  * any transients in the value of "sync_action".
6702                  */
6703                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6704                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6705                 /* Clear some bits that don't mean anything, but
6706                  * might be left set
6707                  */
6708                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6709                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6710
6711                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6712                         goto unlock;
6713                 /* no recovery is running.
6714                  * remove any failed drives, then
6715                  * add spares if possible.
6716                  * Spares are also removed and re-added, to allow
6717                  * the personality to fail the re-add.
6718                  */
6719
6720                 if (mddev->reshape_position != MaxSector) {
6721                         if (mddev->pers->check_reshape == NULL ||
6722                             mddev->pers->check_reshape(mddev) != 0)
6723                                 /* Cannot proceed */
6724                                 goto unlock;
6725                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6726                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6727                 } else if ((spares = remove_and_add_spares(mddev))) {
6728                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6729                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6730                         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6731                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6732                 } else if (mddev->recovery_cp < MaxSector) {
6733                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6734                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6735                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6736                         /* nothing to be done ... */
6737                         goto unlock;
6738
6739                 if (mddev->pers->sync_request) {
6740                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6741                                 /* We are adding a device or devices to an array
6742                                  * which has the bitmap stored on all devices.
6743                                  * So make sure all bitmap pages get written
6744                                  */
6745                                 bitmap_write_all(mddev->bitmap);
6746                         }
6747                         mddev->sync_thread = md_register_thread(md_do_sync,
6748                                                                 mddev,
6749                                                                 "%s_resync");
6750                         if (!mddev->sync_thread) {
6751                                 printk(KERN_ERR "%s: could not start resync"
6752                                         " thread...\n",
6753                                         mdname(mddev));
6754                                 /* leave the spares where they are, it shouldn't hurt */
6755                                 mddev->recovery = 0;
6756                         } else
6757                                 md_wakeup_thread(mddev->sync_thread);
6758                         sysfs_notify_dirent(mddev->sysfs_action);
6759                         md_new_event(mddev);
6760                 }
6761         unlock:
6762                 if (!mddev->sync_thread) {
6763                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6764                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6765                                                &mddev->recovery))
6766                                 if (mddev->sysfs_action)
6767                                         sysfs_notify_dirent(mddev->sysfs_action);
6768                 }
6769                 mddev_unlock(mddev);
6770         }
6771 }
6772
6773 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6774 {
6775         sysfs_notify_dirent(rdev->sysfs_state);
6776         wait_event_timeout(rdev->blocked_wait,
6777                            !test_bit(Blocked, &rdev->flags),
6778                            msecs_to_jiffies(5000));
6779         rdev_dec_pending(rdev, mddev);
6780 }
6781 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6782
6783 static int md_notify_reboot(struct notifier_block *this,
6784                             unsigned long code, void *x)
6785 {
6786         struct list_head *tmp;
6787         mddev_t *mddev;
6788
6789         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6790
6791                 printk(KERN_INFO "md: stopping all md devices.\n");
6792
6793                 for_each_mddev(mddev, tmp)
6794                         if (mddev_trylock(mddev)) {
6795                                 /* Force a switch to readonly even if the
6796                                  * array appears to still be in use.  Hence
6797                                  * the '100'.
6798                                  */
6799                                 do_md_stop(mddev, 1, 100);
6800                                 mddev_unlock(mddev);
6801                         }
6802                 /*
6803                  * certain more exotic SCSI devices are known to be
6804                  * volatile with respect to too-early system reboots. While the
6805                  * right place to handle this issue is the given
6806                  * driver, we do want to have a safe RAID driver ...
6807                  */
6808                 mdelay(1000*1);
6809         }
6810         return NOTIFY_DONE;
6811 }
6812
6813 static struct notifier_block md_notifier = {
6814         .notifier_call  = md_notify_reboot,
6815         .next           = NULL,
6816         .priority       = INT_MAX, /* before any real devices */
6817 };
6818
6819 static void md_geninit(void)
6820 {
6821         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6822
6823         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6824 }
6825
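/*
 * Module init: claim the static MD_MAJOR for classic /dev/mdN nodes and
 * a dynamic major for partitionable "mdp" arrays, then register probe
 * regions so that opening a node can instantiate the array on demand.
 */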
6826 static int __init md_init(void)
6827 {
6828         if (register_blkdev(MD_MAJOR, "md"))
6829                 return -1;
6830         if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6831                 unregister_blkdev(MD_MAJOR, "md");
6832                 return -1;
6833         }
6834         blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6835                             md_probe, NULL, NULL);
6836         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6837                             md_probe, NULL, NULL);
6838
6839         register_reboot_notifier(&md_notifier);
6840         raid_table_header = register_sysctl_table(raid_root_table);
6841
6842         md_geninit();
6843         return 0;
6844 }
6845
6846
6847 #ifndef MODULE
6848
6849 /*
6850  * Searches all registered partitions for autorun RAID arrays
6851  * at boot time.
6852  */
6853
6854 static LIST_HEAD(all_detected_devices);
6855 struct detected_devices_node {
6856         struct list_head list;
6857         dev_t dev;
6858 };
6859
6860 void md_autodetect_dev(dev_t dev)
6861 {
6862         struct detected_devices_node *node_detected_dev;
6863
6864         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6865         if (node_detected_dev) {
6866                 node_detected_dev->dev = dev;
6867                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
6868         } else {
6869                 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6870                         ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6871         }
6872 }
6873
6874
6875 static void autostart_arrays(int part)
6876 {
6877         mdk_rdev_t *rdev;
6878         struct detected_devices_node *node_detected_dev;
6879         dev_t dev;
6880         int i_scanned, i_passed;
6881
6882         i_scanned = 0;
6883         i_passed = 0;
6884
6885         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6886
6887         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6888                 i_scanned++;
6889                 node_detected_dev = list_entry(all_detected_devices.next,
6890                                         struct detected_devices_node, list);
6891                 list_del(&node_detected_dev->list);
6892                 dev = node_detected_dev->dev;
6893                 kfree(node_detected_dev);
6894                 rdev = md_import_device(dev,0, 90);
6895                 if (IS_ERR(rdev))
6896                         continue;
6897
6898                 if (test_bit(Faulty, &rdev->flags)) {
6899                         MD_BUG();
6900                         continue;
6901                 }
6902                 set_bit(AutoDetected, &rdev->flags);
6903                 list_add(&rdev->same_set, &pending_raid_disks);
6904                 i_passed++;
6905         }
6906
6907         printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6908                                                 i_scanned, i_passed);
6909
6910         autorun_devices(part);
6911 }
6912
6913 #endif /* !MODULE */
6914
6915 static __exit void md_exit(void)
6916 {
6917         mddev_t *mddev;
6918         struct list_head *tmp;
6919
6920         blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6921         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6922
6923         unregister_blkdev(MD_MAJOR,"md");
6924         unregister_blkdev(mdp_major, "mdp");
6925         unregister_reboot_notifier(&md_notifier);
6926         unregister_sysctl_table(raid_table_header);
6927         remove_proc_entry("mdstat", NULL);
6928         for_each_mddev(mddev, tmp) {
6929                 export_array(mddev);
6930                 mddev->hold_active = 0;
6931         }
6932 }
6933
6934 subsys_initcall(md_init);
6935 module_exit(md_exit)
6936
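/*
 * The 'start_ro' module parameter makes newly assembled arrays start in
 * auto-read-only mode (ro == 2) until the first write arrives; e.g. boot
 * with md_mod.start_ro=1 (assuming the module is named md_mod).
 */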
6937 static int get_ro(char *buffer, struct kernel_param *kp)
6938 {
6939         return sprintf(buffer, "%d", start_readonly);
6940 }
6941 static int set_ro(const char *val, struct kernel_param *kp)
6942 {
6943         char *e;
6944         int num = simple_strtoul(val, &e, 10);
6945         if (*val && (*e == '\0' || *e == '\n')) {
6946                 start_readonly = num;
6947                 return 0;
6948         }
6949         return -EINVAL;
6950 }
6951
6952 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6953 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6954
6955 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6956
6957 EXPORT_SYMBOL(register_md_personality);
6958 EXPORT_SYMBOL(unregister_md_personality);
6959 EXPORT_SYMBOL(md_error);
6960 EXPORT_SYMBOL(md_done_sync);
6961 EXPORT_SYMBOL(md_write_start);
6962 EXPORT_SYMBOL(md_write_end);
6963 EXPORT_SYMBOL(md_register_thread);
6964 EXPORT_SYMBOL(md_unregister_thread);
6965 EXPORT_SYMBOL(md_wakeup_thread);
6966 EXPORT_SYMBOL(md_check_recovery);
6967 MODULE_LICENSE("GPL");
6968 MODULE_ALIAS("md");
6969 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);