Merge branch 'for-linus' of git://neil.brown.name/md
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 01:30:20 +0000 (17:30 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 01:30:20 +0000 (17:30 -0800)
* 'for-linus' of git://neil.brown.name/md:
  md: Fix removal of extra drives when converting RAID6 to RAID5
  md: range check slot number when manually adding a spare.
  md/raid5: handle manually-added spares in start_reshape.
  md: fix sync_completed reporting for very large drives (>2TB)
  md: allow suspend_lo and suspend_hi to decrease as well as increase.
  md: Don't let implementation detail of curr_resync leak out through sysfs.
  md: separate meta and data devs
  md-new-param-to_sync_page_io
  md-new-param-to-calc_dev_sboffset
  md: Be more careful about clearing flags bit in ->recovery
  md: md_stop_writes requires mddev_lock.
  md/raid5: use sysfs_notify_dirent_safe to avoid NULL pointer
  md: Ensure no IO request to get md device before it is properly initialised.
  md: Fix single printks with multiple KERN_<level>s
  md: fix regression resulting in delays in clearing bits in a bitmap
  md: fix regression with re-adding devices to arrays with no metadata

1  2 
drivers/md/md.c

diff --combined drivers/md/md.c
@@@ -288,10 -288,12 +288,12 @@@ static int md_make_request(struct reque
        int rv;
        int cpu;
  
-       if (mddev == NULL || mddev->pers == NULL) {
+       if (mddev == NULL || mddev->pers == NULL
+           || !mddev->ready) {
                bio_io_error(bio);
                return 0;
        }
+       smp_rmb(); /* Ensure implications of  'active' are visible */
        rcu_read_lock();
        if (mddev->suspended) {
                DEFINE_WAIT(__wait);
@@@ -703,9 -705,9 +705,9 @@@ static struct mdk_personality *find_per
  }
  
  /* return the offset of the super block in 512byte sectors */
- static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+ static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
  {
-       sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
+       sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
        return MD_NEW_SIZE_SECTORS(num_sectors);
  }
  
@@@ -763,7 -765,7 +765,7 @@@ void md_super_write(mddev_t *mddev, mdk
         */
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
  
-       bio->bi_bdev = rdev->bdev;
+       bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
@@@ -793,7 -795,7 +795,7 @@@ static void bi_complete(struct bio *bio
  }
  
  int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
-                struct page *page, int rw)
+                struct page *page, int rw, bool metadata_op)
  {
        struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
        struct completion event;
  
        rw |= REQ_SYNC | REQ_UNPLUG;
  
-       bio->bi_bdev = rdev->bdev;
-       bio->bi_sector = sector;
+       bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+               rdev->meta_bdev : rdev->bdev;
+       if (metadata_op)
+               bio->bi_sector = sector + rdev->sb_start;
+       else
+               bio->bi_sector = sector + rdev->data_offset;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
@@@ -827,7 -833,7 +833,7 @@@ static int read_disk_sb(mdk_rdev_t * rd
                return 0;
  
  
-       if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
+       if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;
@@@ -989,7 -995,7 +995,7 @@@ static int super_90_load(mdk_rdev_t *rd
         *
         * It also happens to be a multiple of 4Kb.
         */
-       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+       rdev->sb_start = calc_dev_sboffset(rdev);
  
        ret = read_disk_sb(rdev, MD_SB_BYTES);
        if (ret) return ret;
@@@ -1330,7 -1336,7 +1336,7 @@@ super_90_rdev_size_change(mdk_rdev_t *r
                return 0; /* component must fit device */
        if (rdev->mddev->bitmap_info.offset)
                return 0; /* can't move bitmap */
-       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+       rdev->sb_start = calc_dev_sboffset(rdev);
        if (!num_sectors || num_sectors > rdev->sb_start)
                num_sectors = rdev->sb_start;
        md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@@ -1879,7 -1885,7 +1885,7 @@@ static int bind_rdev_to_array(mdk_rdev_
        rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
  
        list_add_rcu(&rdev->same_set, &mddev->disks);
 -      bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 +      bd_link_disk_holder(rdev->bdev, mddev->gendisk);
  
        /* May as well allow recovery to be retried once */
        mddev->recovery_disabled = 0;
@@@ -1906,6 -1912,7 +1912,6 @@@ static void unbind_rdev_from_array(mdk_
                MD_BUG();
                return;
        }
 -      bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
        list_del_rcu(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
@@@ -1933,13 -1940,19 +1939,13 @@@ static int lock_rdev(mdk_rdev_t *rdev, 
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
  
 -      bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
 +      bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 +                               shared ? (mdk_rdev_t *)lock_rdev : rdev);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "md: could not open %s.\n",
                        __bdevname(dev, b));
                return PTR_ERR(bdev);
        }
 -      err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
 -      if (err) {
 -              printk(KERN_ERR "md: could not bd_claim %s.\n",
 -                      bdevname(bdev, b));
 -              blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 -              return err;
 -      }
        if (!shared)
                set_bit(AllReserved, &rdev->flags);
        rdev->bdev = bdev;
@@@ -1952,7 -1965,8 +1958,7 @@@ static void unlock_rdev(mdk_rdev_t *rde
        rdev->bdev = NULL;
        if (!bdev)
                MD_BUG();
 -      bd_release(bdev);
 -      blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 +      blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  }
  
  void md_autodetect_dev(dev_t dev);
@@@ -2465,6 -2479,10 +2471,10 @@@ slot_store(mdk_rdev_t *rdev, const cha
                        if (rdev2->raid_disk == slot)
                                return -EEXIST;
  
+               if (slot >= rdev->mddev->raid_disks &&
+                   slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+                       return -ENOSPC;
                rdev->raid_disk = slot;
                if (test_bit(In_sync, &rdev->flags))
                        rdev->saved_raid_disk = slot;
                        /* failure here is OK */;
                /* don't wakeup anyone, leave that to userspace. */
        } else {
-               if (slot >= rdev->mddev->raid_disks)
+               if (slot >= rdev->mddev->raid_disks &&
+                   slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
                        return -ENOSPC;
                rdev->raid_disk = slot;
                /* assume it is working */
@@@ -3107,7 -3126,7 +3118,7 @@@ level_store(mddev_t *mddev, const char 
                char nm[20];
                if (rdev->raid_disk < 0)
                        continue;
-               if (rdev->new_raid_disk > mddev->raid_disks)
+               if (rdev->new_raid_disk >= mddev->raid_disks)
                        rdev->new_raid_disk = -1;
                if (rdev->new_raid_disk == rdev->raid_disk)
                        continue;
@@@ -3736,6 -3755,8 +3747,8 @@@ action_show(mddev_t *mddev, char *page
        return sprintf(page, "%s\n", type);
  }
  
+ static void reap_sync_thread(mddev_t *mddev);
  static ssize_t
  action_store(mddev_t *mddev, const char *page, size_t len)
  {
        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                       md_unregister_thread(mddev->sync_thread);
-                       mddev->sync_thread = NULL;
-                       mddev->recovery = 0;
+                       reap_sync_thread(mddev);
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@@ -3904,7 -3923,7 +3915,7 @@@ static struct md_sysfs_entry md_sync_sp
  static ssize_t
  sync_completed_show(mddev_t *mddev, char *page)
  {
-       unsigned long max_sectors, resync;
+       unsigned long long max_sectors, resync;
  
        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                return sprintf(page, "none\n");
                max_sectors = mddev->dev_sectors;
  
        resync = mddev->curr_resync_completed;
-       return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+       return sprintf(page, "%llu / %llu\n", resync, max_sectors);
  }
  
  static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@@ -4002,19 -4021,24 +4013,24 @@@ suspend_lo_store(mddev_t *mddev, const 
  {
        char *e;
        unsigned long long new = simple_strtoull(buf, &e, 10);
+       unsigned long long old = mddev->suspend_lo;
  
        if (mddev->pers == NULL || 
            mddev->pers->quiesce == NULL)
                return -EINVAL;
        if (buf == e || (*e && *e != '\n'))
                return -EINVAL;
-       if (new >= mddev->suspend_hi ||
-           (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
-               mddev->suspend_lo = new;
+       mddev->suspend_lo = new;
+       if (new >= old)
+               /* Shrinking suspended region */
                mddev->pers->quiesce(mddev, 2);
-               return len;
-       } else
-               return -EINVAL;
+       else {
+               /* Expanding suspended region - need to wait */
+               mddev->pers->quiesce(mddev, 1);
+               mddev->pers->quiesce(mddev, 0);
+       }
+       return len;
  }
  static struct md_sysfs_entry md_suspend_lo =
  __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@@ -4031,20 -4055,24 +4047,24 @@@ suspend_hi_store(mddev_t *mddev, const 
  {
        char *e;
        unsigned long long new = simple_strtoull(buf, &e, 10);
+       unsigned long long old = mddev->suspend_hi;
  
        if (mddev->pers == NULL ||
            mddev->pers->quiesce == NULL)
                return -EINVAL;
        if (buf == e || (*e && *e != '\n'))
                return -EINVAL;
-       if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
-           (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
-               mddev->suspend_hi = new;
+       mddev->suspend_hi = new;
+       if (new <= old)
+               /* Shrinking suspended region */
+               mddev->pers->quiesce(mddev, 2);
+       else {
+               /* Expanding suspended region - need to wait */
                mddev->pers->quiesce(mddev, 1);
                mddev->pers->quiesce(mddev, 0);
-               return len;
-       } else
-               return -EINVAL;
+       }
+       return len;
  }
  static struct md_sysfs_entry md_suspend_hi =
  __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@@ -4422,7 -4450,9 +4442,9 @@@ int md_run(mddev_t *mddev
                 * We don't want the data to overlap the metadata,
                 * Internal Bitmap issues have been handled elsewhere.
                 */
-               if (rdev->data_offset < rdev->sb_start) {
+               if (rdev->meta_bdev) {
+                       /* Nothing to check */;
+               } else if (rdev->data_offset < rdev->sb_start) {
                        if (mddev->dev_sectors &&
                            rdev->data_offset + mddev->dev_sectors
                            > rdev->sb_start) {
        mddev->safemode_timer.data = (unsigned long) mddev;
        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;
+       smp_wmb();
+       mddev->ready = 1;
        list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk >= 0) {
                        char nm[20];
@@@ -4693,13 -4724,12 +4716,12 @@@ static void md_clean(mddev_t *mddev
        mddev->plug = NULL;
  }
  
- void md_stop_writes(mddev_t *mddev)
+ static void __md_stop_writes(mddev_t *mddev)
  {
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-               md_unregister_thread(mddev->sync_thread);
-               mddev->sync_thread = NULL;
+               reap_sync_thread(mddev);
        }
  
        del_timer_sync(&mddev->safemode_timer);
                md_update_sb(mddev, 1);
        }
  }
+ void md_stop_writes(mddev_t *mddev)
+ {
+       mddev_lock(mddev);
+       __md_stop_writes(mddev);
+       mddev_unlock(mddev);
+ }
  EXPORT_SYMBOL_GPL(md_stop_writes);
  
  void md_stop(mddev_t *mddev)
  {
+       mddev->ready = 0;
        mddev->pers->stop(mddev);
        if (mddev->pers->sync_request && mddev->to_remove == NULL)
                mddev->to_remove = &md_redundancy_group;
@@@ -4736,7 -4774,7 +4766,7 @@@ static int md_set_readonly(mddev_t *mdd
                goto out;
        }
        if (mddev->pers) {
-               md_stop_writes(mddev);
+               __md_stop_writes(mddev);
  
                err  = -ENXIO;
                if (mddev->ro==1)
@@@ -4773,7 -4811,7 +4803,7 @@@ static int do_md_stop(mddev_t * mddev, 
                if (mddev->ro)
                        set_disk_ro(disk, 0);
  
-               md_stop_writes(mddev);
+               __md_stop_writes(mddev);
                md_stop(mddev);
                mddev->queue->merge_bvec_fn = NULL;
                mddev->queue->unplug_fn = NULL;
@@@ -5151,9 -5189,10 +5181,10 @@@ static int add_new_disk(mddev_t * mddev
                /* set saved_raid_disk if appropriate */
                if (!mddev->persistent) {
                        if (info->state & (1<<MD_DISK_SYNC)  &&
-                           info->raid_disk < mddev->raid_disks)
+                           info->raid_disk < mddev->raid_disks) {
                                rdev->raid_disk = info->raid_disk;
-                       else
+                               set_bit(In_sync, &rdev->flags);
+                       } else
                                rdev->raid_disk = -1;
                } else
                        super_types[mddev->major_version].
                        printk(KERN_INFO "md: nonpersistent superblock ...\n");
                        rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
                } else
-                       rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+                       rdev->sb_start = calc_dev_sboffset(rdev);
                rdev->sectors = rdev->sb_start;
  
                err = bind_rdev_to_array(rdev, mddev);
@@@ -5297,7 -5336,7 +5328,7 @@@ static int hot_add_disk(mddev_t * mddev
        }
  
        if (mddev->persistent)
-               rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+               rdev->sb_start = calc_dev_sboffset(rdev);
        else
                rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
  
@@@ -5510,7 -5549,6 +5541,6 @@@ static int update_size(mddev_t *mddev, 
         * sb_start or, if that is <data_offset, it must fit before the size
         * of each device.  If num_sectors is zero, we find the largest size
         * that fits.
         */
        if (mddev->sync_thread)
                return -EBUSY;
@@@ -6033,7 -6071,8 +6063,8 @@@ static int md_thread(void * arg
                         || kthread_should_stop(),
                         thread->timeout);
  
-               if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+               clear_bit(THREAD_WAKEUP, &thread->flags);
+               if (!kthread_should_stop())
                        thread->run(thread->mddev);
        }
  
@@@ -6799,7 -6838,7 +6830,7 @@@ void md_do_sync(mddev_t *mddev
                       desc, mdname(mddev));
                mddev->curr_resync = j;
        }
-       mddev->curr_resync_completed = mddev->curr_resync;
+       mddev->curr_resync_completed = j;
  
        while (j < max_sectors) {
                sector_t sectors;
                        md_unplug(mddev);
                        wait_event(mddev->recovery_wait,
                                   atomic_read(&mddev->recovery_active) == 0);
-                       mddev->curr_resync_completed =
-                               mddev->curr_resync;
+                       mddev->curr_resync_completed = j;
                        set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
                }
@@@ -7023,6 -7061,45 +7053,45 @@@ static int remove_and_add_spares(mddev_
        }
        return spares;
  }
+ static void reap_sync_thread(mddev_t *mddev)
+ {
+       mdk_rdev_t *rdev;
+       /* resync has finished, collect result */
+       md_unregister_thread(mddev->sync_thread);
+       mddev->sync_thread = NULL;
+       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+           !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+               /* success...*/
+               /* activate any spares */
+               if (mddev->pers->spare_active(mddev))
+                       sysfs_notify(&mddev->kobj, NULL,
+                                    "degraded");
+       }
+       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+           mddev->pers->finish_reshape)
+               mddev->pers->finish_reshape(mddev);
+       md_update_sb(mddev, 1);
+       /* if array is no-longer degraded, then any saved_raid_disk
+        * information must be scrapped
+        */
+       if (!mddev->degraded)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       rdev->saved_raid_disk = -1;
+       clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+       clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+       clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+       clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+       clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+       /* flag recovery needed just to double check */
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       sysfs_notify_dirent_safe(mddev->sysfs_action);
+       md_new_event(mddev);
+ }
  /*
   * This routine is regularly called by all per-raid-array threads to
   * deal with generic issues like resync and super-block update.
   */
  void md_check_recovery(mddev_t *mddev)
  {
-       mdk_rdev_t *rdev;
        if (mddev->bitmap)
                bitmap_daemon_work(mddev);
  
                        goto unlock;
                }
                if (mddev->sync_thread) {
-                       /* resync has finished, collect result */
-                       md_unregister_thread(mddev->sync_thread);
-                       mddev->sync_thread = NULL;
-                       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-                           !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-                               /* success...*/
-                               /* activate any spares */
-                               if (mddev->pers->spare_active(mddev))
-                                       sysfs_notify(&mddev->kobj, NULL,
-                                                    "degraded");
-                       }
-                       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-                           mddev->pers->finish_reshape)
-                               mddev->pers->finish_reshape(mddev);
-                       md_update_sb(mddev, 1);
-                       /* if array is no-longer degraded, then any saved_raid_disk
-                        * information must be scrapped
-                        */
-                       if (!mddev->degraded)
-                               list_for_each_entry(rdev, &mddev->disks, same_set)
-                                       rdev->saved_raid_disk = -1;
-                       mddev->recovery = 0;
-                       /* flag recovery needed just to double check */
-                       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-                       sysfs_notify_dirent_safe(mddev->sysfs_action);
-                       md_new_event(mddev);
+                       reap_sync_thread(mddev);
                        goto unlock;
                }
                /* Set RUNNING before clearing NEEDED to avoid
                                        " thread...\n", 
                                        mdname(mddev));
                                /* leave the spares where they are, it shouldn't hurt */
-                               mddev->recovery = 0;
+                               clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+                               clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                        } else
                                md_wakeup_thread(mddev->sync_thread);
                        sysfs_notify_dirent_safe(mddev->sysfs_action);