md: factor do_md_run to separate accesses to ->gendisk
[pandora-kernel.git] / drivers / md / md.c
index a20a71e..e752332 100644 (file)
@@ -214,8 +214,11 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  */
 static int md_make_request(struct request_queue *q, struct bio *bio)
 {
+       const int rw = bio_data_dir(bio);
        mddev_t *mddev = q->queuedata;
        int rv;
+       int cpu;
+
        if (mddev == NULL || mddev->pers == NULL) {
                bio_io_error(bio);
                return 0;
@@ -236,7 +239,15 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
        }
        atomic_inc(&mddev->active_io);
        rcu_read_unlock();
+
        rv = mddev->pers->make_request(q, bio);
+
+       cpu = part_stat_lock();
+       part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+       part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+                     bio_sectors(bio));
+       part_stat_unlock();
+
        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);
 
@@ -507,9 +518,36 @@ static inline int mddev_trylock(mddev_t * mddev)
        return mutex_trylock(&mddev->reconfig_mutex);
 }
 
-static inline void mddev_unlock(mddev_t * mddev)
+static struct attribute_group md_redundancy_group;
+
+static void mddev_unlock(mddev_t * mddev)
 {
-       mutex_unlock(&mddev->reconfig_mutex);
+       if (mddev->to_remove) {
+               /* These cannot be removed under reconfig_mutex as
+                * an access to the files will try to take reconfig_mutex
+                * while holding the file unremovable, which leads to
+                * a deadlock.
+                * So hold open_mutex instead - we are allowed to take
+                * it while holding reconfig_mutex, and md_run can
+                * use it to wait for the remove to complete.
+                */
+               struct attribute_group *to_remove = mddev->to_remove;
+               mddev->to_remove = NULL;
+               mutex_lock(&mddev->open_mutex);
+               mutex_unlock(&mddev->reconfig_mutex);
+
+               if (to_remove != &md_redundancy_group)
+                       sysfs_remove_group(&mddev->kobj, to_remove);
+               if (mddev->pers == NULL ||
+                   mddev->pers->sync_request == NULL) {
+                       sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+                       if (mddev->sysfs_action)
+                               sysfs_put(mddev->sysfs_action);
+                       mddev->sysfs_action = NULL;
+               }
+               mutex_unlock(&mddev->open_mutex);
+       } else
+               mutex_unlock(&mddev->reconfig_mutex);
 
        md_wakeup_thread(mddev->thread);
 }
@@ -2358,6 +2396,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                        return err;
                sprintf(nm, "rd%d", rdev->raid_disk);
                sysfs_remove_link(&rdev->mddev->kobj, nm);
+               rdev->raid_disk = -1;
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
        } else if (rdev->mddev->pers) {
@@ -2773,8 +2812,9 @@ static void analyze_sbs(mddev_t * mddev)
 
        i = 0;
        rdev_for_each(rdev, tmp, mddev) {
-               if (rdev->desc_nr >= mddev->max_disks ||
-                   i > mddev->max_disks) {
+               if (mddev->max_disks &&
+                   (rdev->desc_nr >= mddev->max_disks ||
+                    i > mddev->max_disks)) {
                        printk(KERN_WARNING
                               "md: %s: %s: only %d devices permitted\n",
                               mdname(mddev), bdevname(rdev->bdev, b),
@@ -2973,6 +3013,37 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
        /* Looks like we have a winner */
        mddev_suspend(mddev);
        mddev->pers->stop(mddev);
+       
+       if (mddev->pers->sync_request == NULL &&
+           pers->sync_request != NULL) {
+               /* need to add the md_redundancy_group */
+               if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
+                       printk(KERN_WARNING
+                              "md: cannot register extra attributes for %s\n",
+                              mdname(mddev));
+               mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
+       }               
+       if (mddev->pers->sync_request != NULL &&
+           pers->sync_request == NULL) {
+               /* need to remove the md_redundancy_group */
+               if (mddev->to_remove == NULL)
+                       mddev->to_remove = &md_redundancy_group;
+       }
+
+       if (mddev->pers->sync_request == NULL &&
+           mddev->external) {
+               /* We are converting from a no-redundancy array
+                * to a redundancy array and metadata is managed
+                * externally so we need to be sure that writes
+                * won't block due to a need to transition
+                *      clean->dirty
+                * until external management is started.
+                */
+               mddev->in_sync = 0;
+               mddev->safemode_delay = 0;
+               mddev->safemode = 0;
+       }
+
        module_put(mddev->pers->owner);
        /* Invalidate devices that are now superfluous */
        list_for_each_entry(rdev, &mddev->disks, same_set)
@@ -2987,11 +3058,19 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
        mddev->layout = mddev->new_layout;
        mddev->chunk_sectors = mddev->new_chunk_sectors;
        mddev->delta_disks = 0;
+       if (mddev->pers->sync_request == NULL) {
+               /* this is now an array without redundancy, so
+                * it must always be in_sync
+                */
+               mddev->in_sync = 1;
+               del_timer_sync(&mddev->safemode_timer);
+       }
        pers->run(mddev);
        mddev_resume(mddev);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
+       sysfs_notify(&mddev->kobj, NULL, "level");
        return rv;
 }
 
@@ -4075,15 +4154,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
 {
        mddev_t *mddev = container_of(ws, mddev_t, del_work);
 
-       if (mddev->private) {
-               sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-               if (mddev->private != (void*)1)
-                       sysfs_remove_group(&mddev->kobj, mddev->private);
-               if (mddev->sysfs_action)
-                       sysfs_put(mddev->sysfs_action);
-               mddev->sysfs_action = NULL;
-               mddev->private = NULL;
-       }
        sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
        kobject_del(&mddev->kobj);
        kobject_put(&mddev->kobj);
@@ -4227,11 +4297,10 @@ static void md_safemode_timeout(unsigned long data)
 
 static int start_dirty_degraded;
 
-static int do_md_run(mddev_t * mddev)
+static int md_run(mddev_t *mddev)
 {
        int err;
        mdk_rdev_t *rdev;
-       struct gendisk *disk;
        struct mdk_personality *pers;
 
        if (list_empty(&mddev->disks))
@@ -4241,6 +4310,13 @@ static int do_md_run(mddev_t * mddev)
        if (mddev->pers)
                return -EBUSY;
 
+       /* These two calls synchronise us with the
+        * sysfs_remove_group calls in mddev_unlock,
+        * so they must have completed.
+        */
+       mutex_lock(&mddev->open_mutex);
+       mutex_unlock(&mddev->open_mutex);
+
        /*
         * Analyze all RAID superblock(s)
         */
@@ -4289,8 +4365,6 @@ static int do_md_run(mddev_t * mddev)
                sysfs_notify_dirent(rdev->sysfs_state);
        }
 
-       disk = mddev->gendisk;
-
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
        if (!pers || !try_module_get(pers->owner)) {
@@ -4418,22 +4492,32 @@ static int do_md_run(mddev_t * mddev)
        if (mddev->flags)
                md_update_sb(mddev, 0);
 
-       set_capacity(disk, mddev->array_sectors);
-
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
-       revalidate_disk(mddev->gendisk);
-       mddev->changed = 1;
        md_new_event(mddev);
        sysfs_notify_dirent(mddev->sysfs_state);
        if (mddev->sysfs_action)
                sysfs_notify_dirent(mddev->sysfs_action);
        sysfs_notify(&mddev->kobj, NULL, "degraded");
-       kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
        return 0;
 }
 
+static int do_md_run(mddev_t *mddev)
+{
+       int err;
+
+       err = md_run(mddev);
+       if (err)
+               goto out;
+
+       set_capacity(mddev->gendisk, mddev->array_sectors);
+       revalidate_disk(mddev->gendisk);
+       kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+out:
+       return err;
+}
+
 static int restart_array(mddev_t *mddev)
 {
        struct gendisk *disk = mddev->gendisk;
@@ -4529,8 +4613,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                        mddev->queue->unplug_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
                        module_put(mddev->pers->owner);
-                       if (mddev->pers->sync_request && mddev->private == NULL)
-                               mddev->private = (void*)1;
+                       if (mddev->pers->sync_request && mddev->to_remove == NULL)
+                               mddev->to_remove = &md_redundancy_group;
                        mddev->pers = NULL;
                        /* tell userspace to handle 'inactive' */
                        sysfs_notify_dirent(mddev->sysfs_state);
@@ -4543,7 +4627,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                                }
 
                        set_capacity(disk, 0);
-                       mddev->changed = 1;
+                       revalidate_disk(disk);
 
                        if (mddev->ro)
                                mddev->ro = 0;
@@ -4609,7 +4693,6 @@ out:
                mddev->sync_speed_min = mddev->sync_speed_max = 0;
                mddev->recovery = 0;
                mddev->in_sync = 0;
-               mddev->changed = 0;
                mddev->degraded = 0;
                mddev->barriers_work = 0;
                mddev->safemode = 0;
@@ -5342,7 +5425,7 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
        if (mddev->pers->check_reshape == NULL)
                return -EINVAL;
        if (raid_disks <= 0 ||
-           raid_disks >= mddev->max_disks)
+           (mddev->max_disks && raid_disks >= mddev->max_disks))
                return -EINVAL;
        if (mddev->sync_thread || mddev->reshape_position != MaxSector)
                return -EBUSY;
@@ -5479,7 +5562,7 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 
        geo->heads = 2;
        geo->sectors = 4;
-       geo->cylinders = get_capacity(mddev->gendisk) / 8;
+       geo->cylinders = mddev->array_sectors / 8;
        return 0;
 }
 
@@ -5489,6 +5572,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
        int err = 0;
        void __user *argp = (void __user *)arg;
        mddev_t *mddev = NULL;
+       int ro;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
@@ -5624,6 +5708,34 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
                        err = do_md_stop(mddev, 1, 1);
                        goto done_unlock;
 
+               case BLKROSET:
+                       if (get_user(ro, (int __user *)(arg))) {
+                               err = -EFAULT;
+                               goto done_unlock;
+                       }
+                       err = -EINVAL;
+
+                       /* if the bdev is going readonly the value of mddev->ro
+                        * does not matter, no writes are coming
+                        */
+                       if (ro)
+                               goto done_unlock;
+
+                       /* are we already prepared for writes? */
+                       if (mddev->ro != 1)
+                               goto done_unlock;
+
+                       /* transitioning to readauto need only happen for
+                        * arrays that call md_write_start
+                        */
+                       if (mddev->pers) {
+                               err = restart_array(mddev);
+                               if (err == 0) {
+                                       mddev->ro = 2;
+                                       set_disk_ro(mddev->gendisk, 0);
+                               }
+                       }
+                       goto done_unlock;
        }
 
        /*
@@ -5744,7 +5856,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
        atomic_inc(&mddev->openers);
        mutex_unlock(&mddev->open_mutex);
 
-       check_disk_change(bdev);
  out:
        return err;
 }
@@ -5759,21 +5870,6 @@ static int md_release(struct gendisk *disk, fmode_t mode)
 
        return 0;
 }
-
-static int md_media_changed(struct gendisk *disk)
-{
-       mddev_t *mddev = disk->private_data;
-
-       return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
-       mddev_t *mddev = disk->private_data;
-
-       mddev->changed = 0;
-       return 0;
-}
 static const struct block_device_operations md_fops =
 {
        .owner          = THIS_MODULE,
@@ -5784,8 +5880,6 @@ static const struct block_device_operations md_fops =
        .compat_ioctl   = md_compat_ioctl,
 #endif
        .getgeo         = md_getgeo,
-       .media_changed  = md_media_changed,
-       .revalidate_disk= md_revalidate,
 };
 
 static int md_thread(void * arg)
@@ -5899,7 +5993,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
        mddev->pers->error_handler(mddev,rdev);
        if (mddev->degraded)
                set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-       set_bit(StateChanged, &rdev->flags);
+       sysfs_notify_dirent(rdev->sysfs_state);
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
@@ -6891,11 +6985,6 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->flags)
                        md_update_sb(mddev, 0);
 
-               list_for_each_entry(rdev, &mddev->disks, same_set)
-                       if (test_and_clear_bit(StateChanged, &rdev->flags))
-                               sysfs_notify_dirent(rdev->sysfs_state);
-
-
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
                        /* resync/recovery still happening */