md: Protect access to mddev->disks list using RCU
authorNeilBrown <neilb@suse.de>
Mon, 21 Jul 2008 07:05:25 +0000 (17:05 +1000)
committerNeilBrown <neilb@suse.de>
Mon, 21 Jul 2008 07:05:25 +0000 (17:05 +1000)
All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock.  However there are three places where
the list is walked without any locking.  If a reconfig happens at this
time, havoc (and oops) can ensue.

So use RCU to protect these accesses:
  - wrap them in rcu_read_{,un}lock()
  - use list_for_each_entry_rcu
  - add to the list with list_add_rcu
  - delete from the list with list_del_rcu
  - delay the 'free' with call_rcu rather than schedule_work

Note that export_rdev did a list_del_init on this list.  In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe.  It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
  - after unbind_rdev_from_array, in which case the delete has
     already been done,
  - after bind_rdev_to_array fails, in which case the delete isn't needed.
  - before the device has been put on a list at all (e.g. in
      add_new_disk where reading the superblock fails).
  - and in autorun devices after a failure when the device is on a
      different list.

So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.

Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates).  In these cases rcu is not needed.

Signed-off-by: NeilBrown <neilb@suse.de>
drivers/md/bitmap.c
drivers/md/md.c
include/linux/raid/md_k.h

index eba83e2..621a272 100644 (file)
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
        mdk_rdev_t *rdev;
-       struct list_head *tmp;
        mddev_t *mddev = bitmap->mddev;
 
-       rdev_for_each(rdev, tmp, mddev)
+       rcu_read_lock();
+       rdev_for_each_rcu(rdev, mddev)
                if (test_bit(In_sync, &rdev->flags)
                    && !test_bit(Faulty, &rdev->flags)) {
                        int size = PAGE_SIZE;
@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                                    + (long)(page->index * (PAGE_SIZE/512))
                                    + size/512 > 0)
                                        /* bitmap runs in to metadata */
-                                       return -EINVAL;
+                                       goto bad_alignment;
                                if (rdev->data_offset + mddev->size*2
                                    > rdev->sb_start + bitmap->offset)
                                        /* data runs in to bitmap */
-                                       return -EINVAL;
+                                       goto bad_alignment;
                        } else if (rdev->sb_start < rdev->data_offset) {
                                /* METADATA BITMAP DATA */
                                if (rdev->sb_start
@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                                    + page->index*(PAGE_SIZE/512) + size/512
                                    > rdev->data_offset)
                                        /* bitmap runs in to data */
-                                       return -EINVAL;
+                                       goto bad_alignment;
                        } else {
                                /* DATA METADATA BITMAP - no problems */
                        }
@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                                       size,
                                       page);
                }
+       rcu_read_unlock();
 
        if (wait)
                md_super_wait(mddev);
        return 0;
+
+ bad_alignment:
+       rcu_read_unlock();
+       return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
index 450f30b..c2ff77c 100644 (file)
@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = {
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-       struct list_head *tmp, *tmp2;
        mdk_rdev_t *rdev, *rdev2;
 
-       rdev_for_each(rdev, tmp, mddev1)
-               rdev_for_each(rdev2, tmp2, mddev2)
+       rcu_read_lock();
+       rdev_for_each_rcu(rdev, mddev1)
+               rdev_for_each_rcu(rdev2, mddev2)
                        if (rdev->bdev->bd_contains ==
-                           rdev2->bdev->bd_contains)
+                           rdev2->bdev->bd_contains) {
+                               rcu_read_unlock();
                                return 1;
-
+                       }
+       rcu_read_unlock();
        return 0;
 }
 
@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
                kobject_del(&rdev->kobj);
                goto fail;
        }
-       list_add(&rdev->same_set, &mddev->disks);
+       list_add_rcu(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
        return 0;
 
@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
                return;
        }
        bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-       list_del_init(&rdev->same_set);
+       list_del_rcu(&rdev->same_set);
        printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
        rdev->mddev = NULL;
        sysfs_remove_link(&rdev->kobj, "block");
 
        /* We need to delay this, otherwise we can deadlock when
-        * writing to 'remove' to "dev/state"
+        * writing to 'remove' to "dev/state".  We also need
+        * to delay it due to rcu usage.
         */
+       synchronize_rcu();
        INIT_WORK(&rdev->del_work, md_delayed_delete);
        kobject_get(&rdev->kobj);
        schedule_work(&rdev->del_work);
@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
        if (rdev->mddev)
                MD_BUG();
        free_disk_sb(rdev);
-       list_del_init(&rdev->same_set);
 #ifndef MODULE
        if (test_bit(AutoDetected, &rdev->flags))
                md_autodetect_dev(rdev->bdev->bd_dev);
@@ -4062,8 +4065,10 @@ static void autorun_devices(int part)
                /* on success, candidates will be empty, on error
                 * it won't...
                 */
-               rdev_for_each_list(rdev, tmp, candidates)
+               rdev_for_each_list(rdev, tmp, candidates) {
+                       list_del_init(&rdev->same_set);
                        export_rdev(rdev);
+               }
                mddev_put(mddev);
        }
        printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
 static int is_mddev_idle(mddev_t *mddev)
 {
        mdk_rdev_t * rdev;
-       struct list_head *tmp;
        int idle;
        long curr_events;
 
        idle = 1;
-       rdev_for_each(rdev, tmp, mddev) {
+       rcu_read_lock();
+       rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
                curr_events = disk_stat_read(disk, sectors[0]) + 
                                disk_stat_read(disk, sectors[1]) - 
@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
                        idle = 0;
                }
        }
+       rcu_read_unlock();
        return idle;
 }
 
index 4bef479..9f2549a 100644 (file)
@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
 #define rdev_for_each(rdev, tmp, mddev)                                \
        rdev_for_each_list(rdev, tmp, (mddev)->disks)
 
+#define rdev_for_each_rcu(rdev, mddev)                         \
+       list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
 typedef struct mdk_thread_s {
        void                    (*run) (mddev_t *mddev);
        mddev_t                 *mddev;