md: make it easier to wait for bad blocks to be acknowledged.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e84668..fe6692e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
+#include <linux/ratelimit.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
        for (j = 0 ; j < nalloc; j++) {
                bio = r10_bio->devs[j].bio;
                for (i = 0; i < RESYNC_PAGES; i++) {
-                       page = alloc_page(gfp_flags);
+                       if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
+                                               &conf->mddev->recovery)) {
+                               /* we can share bv_page's during recovery */
+                               struct bio *rbio = r10_bio->devs[0].bio;
+                               page = rbio->bi_io_vec[i].bv_page;
+                               get_page(page);
+                       } else
+                               page = alloc_page(gfp_flags);
                        if (unlikely(!page))
                                goto out_free_pages;
 
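For context: during recovery only one mirror is read, so the write bio in devs[1] can point at the very pages already allocated for the read bio in devs[0], pinned with get_page(); a resync (MD_RECOVERY_SYNC set) still compares all copies and so keeps separate pages. A minimal userspace sketch of that reference-counted sharing follows; upage, upage_get and upage_put are invented stand-ins for the kernel's page refcounting, not real APIs.

/*
 * Illustration only, not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

struct upage {
	int refcount;
	char data[4096];
};

static struct upage *upage_alloc(void)
{
	struct upage *p = calloc(1, sizeof(*p));

	if (p)
		p->refcount = 1;
	return p;
}

static void upage_get(struct upage *p)		/* like get_page() */
{
	p->refcount++;
}

static void upage_put(struct upage *p)		/* like put_page() */
{
	if (--p->refcount == 0)
		free(p);
}

int main(void)
{
	int recovery = 1;			/* 1: recovery, 0: resync */
	struct upage *first = upage_alloc();	/* page held by devs[0]   */
	struct upage *second;

	if (!first)
		return 1;

	if (recovery) {
		second = first;			/* share the page ...           */
		upage_get(second);		/* ... and take an extra ref    */
	} else {
		second = upage_alloc();		/* resync needs a separate page */
		if (!second)
			return 1;
	}

	printf("shared=%d refcount=%d\n", second == first, first->refcount);

	upage_put(second);			/* both owners drop their ref;    */
	upage_put(first);			/* the page is freed exactly once */
	return 0;
}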
@@ -244,6 +252,23 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
                r10_bio->devs[slot].addr + (r10_bio->sectors);
 }
 
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, struct bio *bio)
+{
+       int slot;
+
+       for (slot = 0; slot < conf->copies; slot++)
+               if (r10_bio->devs[slot].bio == bio)
+                       break;
+
+       BUG_ON(slot == conf->copies);
+       update_head_pos(slot, r10_bio);
+
+       return r10_bio->devs[slot].devnum;
+}
+
 static void raid10_end_read_request(struct bio *bio, int error)
 {
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,10 +302,11 @@ static void raid10_end_read_request(struct bio *bio, int error)
                 * oops, read error - keep the refcount on the rdev
                 */
                char b[BDEVNAME_SIZE];
-               if (printk_ratelimit())
-                       printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
-                              mdname(conf->mddev),
-                              bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
+               printk_ratelimited(KERN_ERR
+                                  "md/raid10:%s: %s: rescheduling sector %llu\n",
+                                  mdname(conf->mddev),
+                                  bdevname(conf->mirrors[dev].rdev->bdev, b),
+                                  (unsigned long long)r10_bio->sector);
                reschedule_retry(r10_bio);
        }
 }
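The change above replaces the open-coded "if (printk_ratelimit()) printk(...)" pair with printk_ratelimited(), which carries its own per-callsite rate-limit state; that is what the new <linux/ratelimit.h> include is for. Below is a rough userspace analogue of this kind of interval-plus-burst limiting, not the kernel's implementation; ratelimit_ok(), log_ratelimited() and the 5-second/10-message numbers are just for the sketch.

/*
 * Illustration only: suppress messages once a burst limit is exceeded
 * within a time window.
 */
#include <stdio.h>
#include <time.h>

#define RL_INTERVAL_SEC 5	/* allow at most RL_BURST messages ... */
#define RL_BURST        10	/* ... per RL_INTERVAL_SEC window      */

static int ratelimit_ok(void)
{
	static time_t window_start;
	static int printed;
	time_t now = time(NULL);

	if (now - window_start >= RL_INTERVAL_SEC) {
		window_start = now;	/* start a new window */
		printed = 0;
	}
	if (printed < RL_BURST) {
		printed++;
		return 1;
	}
	return 0;			/* suppress: over the limit */
}

#define log_ratelimited(...) do { if (ratelimit_ok()) fprintf(stderr, __VA_ARGS__); } while (0)

int main(void)
{
	for (int i = 0; i < 100; i++)
		log_ratelimited("rescheduling sector %d\n", i);	/* only the first 10 appear */
	return 0;
}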
@@ -289,13 +315,10 @@ static void raid10_end_write_request(struct bio *bio, int error)
 {
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        r10bio_t *r10_bio = bio->bi_private;
-       int slot, dev;
+       int dev;
        conf_t *conf = r10_bio->mddev->private;
 
-       for (slot = 0; slot < conf->copies; slot++)
-               if (r10_bio->devs[slot].bio == bio)
-                       break;
-       dev = r10_bio->devs[slot].devnum;
+       dev = find_bio_disk(conf, r10_bio, bio);
 
        /*
         * this branch is our 'one mirror IO has finished' event handler:
@@ -316,8 +339,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
 
-       update_head_pos(slot, r10_bio);
-
        /*
         *
         * Let's see if all mirrored write operations have finished
@@ -949,6 +970,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
        seq_printf(seq, "]");
 }
 
+/* check if there are enough drives for
+ * every block to appear on at least one.
+ * Don't consider the device numbered 'ignore'
+ * as we might be about to remove it.
+ */
+static int enough(conf_t *conf, int ignore)
+{
+       int first = 0;
+
+       do {
+               int n = conf->copies;
+               int cnt = 0;
+               while (n--) {
+                       if (conf->mirrors[first].rdev &&
+                           first != ignore)
+                               cnt++;
+                       first = (first+1) % conf->raid_disks;
+               }
+               if (cnt == 0)
+                       return 0;
+       } while (first != 0);
+       return 1;
+}
+
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        char b[BDEVNAME_SIZE];
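The new enough(conf, ignore) is the old helper moved up and taught to pretend one device is already gone: error() uses it to ask whether the array would still cover every block without the failing device, while the -1 callers simply check the current state. Below is a standalone, runnable rendering of the same loop, assuming a plain near-copies layout; working[] stands in for the conf->mirrors[].rdev checks and the values in main() are made up.

/*
 * Illustration only: walk the mirrors in groups of 'copies' consecutive
 * slots (wrapping modulo raid_disks) and require every group to contain at
 * least one working device that is not the one being ignored.
 */
#include <stdio.h>

static int enough(const int *working, int raid_disks, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies;
		int cnt = 0;

		while (n--) {
			if (working[first] && first != ignore)
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* some block would have no copy left */
	} while (first != 0);
	return 1;
}

int main(void)
{
	/* 4 disks, 2 near copies: slots {0,1} and {2,3} each hold one block's copies */
	int working[4] = { 1, 0, 1, 1 };

	printf("%d\n", enough(working, 4, 2, -1));	/* 1: every pair still has a copy   */
	printf("%d\n", enough(working, 4, 2, 0));	/* 0: removing disk 0 empties {0,1} */
	return 0;
}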
@@ -961,13 +1006,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
         * else mark the drive as failed
         */
        if (test_bit(In_sync, &rdev->flags)
-           && conf->raid_disks-mddev->degraded == 1)
+           && !enough(conf, rdev->raid_disk))
                /*
                 * Don't fail the drive, just return an IO error.
-                * The test should really be more sophisticated than
-                * "working_disks == 1", but it isn't critical, and
-                * can wait until we do more sophisticated "is the drive
-                * really dead" tests...
                 */
                return;
        if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                 */
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        }
+       set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        printk(KERN_ALERT
@@ -1022,27 +1064,6 @@ static void close_sync(conf_t *conf)
        conf->r10buf_pool = NULL;
 }
 
-/* check if there are enough drives for
- * every block to appear on atleast one
- */
-static int enough(conf_t *conf)
-{
-       int first = 0;
-
-       do {
-               int n = conf->copies;
-               int cnt = 0;
-               while (n--) {
-                       if (conf->mirrors[first].rdev)
-                               cnt++;
-                       first = (first+1) % conf->raid_disks;
-               }
-               if (cnt == 0)
-                       return 0;
-       } while (first != 0);
-       return 1;
-}
-
 static int raid10_spare_active(mddev_t *mddev)
 {
        int i;
@@ -1078,53 +1099,58 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
        conf_t *conf = mddev->private;
        int err = -EEXIST;
        int mirror;
-       mirror_info_t *p;
        int first = 0;
        int last = conf->raid_disks - 1;
 
+       if (rdev->badblocks.count)
+               return -EINVAL;
+
        if (mddev->recovery_cp < MaxSector)
                /* only hot-add to in-sync arrays, as recovery is
                 * very different from resync
                 */
                return -EBUSY;
-       if (!enough(conf))
+       if (!enough(conf, -1))
                return -EINVAL;
 
        if (rdev->raid_disk >= 0)
                first = last = rdev->raid_disk;
 
-       if (rdev->saved_raid_disk >= 0 &&
-           rdev->saved_raid_disk >= first &&
+       if (rdev->saved_raid_disk >= first &&
            conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
                mirror = rdev->saved_raid_disk;
        else
                mirror = first;
-       for ( ; mirror <= last ; mirror++)
-               if ( !(p=conf->mirrors+mirror)->rdev) {
-
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->data_offset << 9);
-                       /* as we don't honour merge_bvec_fn, we must
-                        * never risk violating it, so limit
-                        * ->max_segments to one lying with a single
-                        * page, as a one page request is never in
-                        * violation.
-                        */
-                       if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-                               blk_queue_max_segments(mddev->queue, 1);
-                               blk_queue_segment_boundary(mddev->queue,
-                                                          PAGE_CACHE_SIZE - 1);
-                       }
+       for ( ; mirror <= last ; mirror++) {
+               mirror_info_t *p = &conf->mirrors[mirror];
+               if (p->recovery_disabled == mddev->recovery_disabled)
+                       continue;
+               if (p->rdev)
+                       continue;
 
-                       p->head_position = 0;
-                       rdev->raid_disk = mirror;
-                       err = 0;
-                       if (rdev->saved_raid_disk != mirror)
-                               conf->fullsync = 1;
-                       rcu_assign_pointer(p->rdev, rdev);
-                       break;
+               disk_stack_limits(mddev->gendisk, rdev->bdev,
+                                 rdev->data_offset << 9);
+               /* as we don't honour merge_bvec_fn, we must
+                * never risk violating it, so limit
+                * ->max_segments to one lying with a single
+                * page, as a one page request is never in
+                * violation.
+                */
+               if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+                       blk_queue_max_segments(mddev->queue, 1);
+                       blk_queue_segment_boundary(mddev->queue,
+                                                  PAGE_CACHE_SIZE - 1);
                }
 
+               p->head_position = 0;
+               rdev->raid_disk = mirror;
+               err = 0;
+               if (rdev->saved_raid_disk != mirror)
+                       conf->fullsync = 1;
+               rcu_assign_pointer(p->rdev, rdev);
+               break;
+       }
+
        md_integrity_add_rdev(rdev, mddev);
        print_conf(conf);
        return err;
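Beyond the reindentation, the hot-add loop now also skips slots whose recovery_disabled value matches the current mddev generation (recorded when a recovery attempt has to be abandoned, see recovery_request_write() further down), so a device whose rebuild just failed is not immediately re-added to the same slot. A reduced, runnable sketch of only that slot selection follows; struct slot and pick_slot() are simplified stand-ins, not kernel structures.

/*
 * Illustration only: skip disabled and occupied slots, claim the first
 * remaining free one.
 */
#include <stdio.h>

struct slot {
	int occupied;		/* stands in for p->rdev != NULL            */
	int recovery_disabled;	/* generation at which recovery was given up */
};

static int pick_slot(struct slot *slots, int nslots, int current_generation)
{
	for (int i = 0; i < nslots; i++) {
		if (slots[i].recovery_disabled == current_generation)
			continue;	/* recovery to this slot just failed */
		if (slots[i].occupied)
			continue;	/* slot already has a device */
		return i;		/* first usable free slot */
	}
	return -1;			/* no usable slot: hot-add fails */
}

int main(void)
{
	struct slot slots[4] = {
		{ .occupied = 1 },		/* in use            */
		{ .recovery_disabled = 7 },	/* failed this round */
		{ .occupied = 0 },		/* free and usable   */
		{ .occupied = 1 },
	};

	printf("chosen slot: %d\n", pick_slot(slots, 4, 7));	/* prints 2 */
	return 0;
}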
@@ -1149,7 +1175,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
                 * is not possible.
                 */
                if (!test_bit(Faulty, &rdev->flags) &&
-                   enough(conf)) {
+                   mddev->recovery_disabled != p->recovery_disabled &&
+                   enough(conf, -1)) {
                        err = -EBUSY;
                        goto abort;
                }
@@ -1174,14 +1201,9 @@ static void end_sync_read(struct bio *bio, int error)
 {
        r10bio_t *r10_bio = bio->bi_private;
        conf_t *conf = r10_bio->mddev->private;
-       int i,d;
+       int d;
 
-       for (i=0; i<conf->copies; i++)
-               if (r10_bio->devs[i].bio == bio)
-                       break;
-       BUG_ON(i == conf->copies);
-       update_head_pos(i, r10_bio);
-       d = r10_bio->devs[i].devnum;
+       d = find_bio_disk(conf, r10_bio, bio);
 
        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
                set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1212,18 +1234,13 @@ static void end_sync_write(struct bio *bio, int error)
        r10bio_t *r10_bio = bio->bi_private;
        mddev_t *mddev = r10_bio->mddev;
        conf_t *conf = mddev->private;
-       int i,d;
+       int d;
 
-       for (i = 0; i < conf->copies; i++)
-               if (r10_bio->devs[i].bio == bio)
-                       break;
-       d = r10_bio->devs[i].devnum;
+       d = find_bio_disk(conf, r10_bio, bio);
 
        if (!uptodate)
                md_error(mddev, conf->mirrors[d].rdev);
 
-       update_head_pos(i, r10_bio);
-
        rdev_dec_pending(conf->mirrors[d].rdev, mddev);
        while (atomic_dec_and_test(&r10_bio->remaining)) {
                if (r10_bio->master_bio == NULL) {
@@ -1359,28 +1376,28 @@ done:
 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
        conf_t *conf = mddev->private;
-       int i, d;
-       struct bio *bio, *wbio;
-
+       int d;
+       struct bio *wbio;
 
-       /* move the pages across to the second bio
+       /*
+        * share the pages with the first bio
         * and submit the write request
         */
-       bio = r10_bio->devs[0].bio;
        wbio = r10_bio->devs[1].bio;
-       for (i=0; i < wbio->bi_vcnt; i++) {
-               struct page *p = bio->bi_io_vec[i].bv_page;
-               bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
-               wbio->bi_io_vec[i].bv_page = p;
-       }
        d = r10_bio->devs[1].devnum;
 
        atomic_inc(&conf->mirrors[d].rdev->nr_pending);
        md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
        if (test_bit(R10BIO_Uptodate, &r10_bio->state))
                generic_make_request(wbio);
-       else
-               bio_endio(wbio, -EIO);
+       else {
+               printk(KERN_NOTICE
+                      "md/raid10:%s: recovery aborted due to read error\n",
+                      mdname(mddev));
+               conf->mirrors[d].recovery_disabled = mddev->recovery_disabled;
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+               bio_endio(wbio, 0);
+       }
 }
 
 
@@ -1520,7 +1537,6 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                            test_bit(In_sync, &rdev->flags)) {
                                atomic_inc(&rdev->nr_pending);
                                rcu_read_unlock();
-                               atomic_add(s, &rdev->corrected_errors);
                                if (sync_page_io(rdev,
                                                 r10_bio->devs[sl].addr +
                                                 sect,
@@ -1585,6 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               (unsigned long long)(
                                                       sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
+                                       atomic_add(s, &rdev->corrected_errors);
                                }
 
                                rdev_dec_pending(rdev, mddev);
@@ -1667,12 +1684,13 @@ static void raid10d(mddev_t *mddev)
                                bio_put(bio);
                                slot = r10_bio->read_slot;
                                rdev = conf->mirrors[mirror].rdev;
-                               if (printk_ratelimit())
-                                       printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
-                                              " another mirror\n",
-                                              mdname(mddev),
-                                              bdevname(rdev->bdev,b),
-                                              (unsigned long long)r10_bio->sector);
+                               printk_ratelimited(
+                                       KERN_ERR
+                                       "md/raid10:%s: %s: redirecting "
+                                       "sector %llu to another mirror\n",
+                                       mdname(mddev),
+                                       bdevname(rdev->bdev, b),
+                                       (unsigned long long)r10_bio->sector);
                                bio = bio_clone_mddev(r10_bio->master_bio,
                                                      GFP_NOIO, mddev);
                                r10_bio->devs[slot].bio = bio;
@@ -1686,6 +1704,8 @@ static void raid10d(mddev_t *mddev)
                        }
                }
                cond_resched();
+               if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+                       md_check_recovery(mddev);
        }
        blk_finish_plug(&plug);
 }
@@ -2249,6 +2269,11 @@ static int run(mddev_t *mddev)
                                 (conf->raid_disks / conf->near_copies));
 
        list_for_each_entry(rdev, &mddev->disks, same_set) {
+
+               if (rdev->badblocks.count) {
+                       printk(KERN_ERR "md/raid10: cannot handle bad blocks yet\n");
+                       goto out_free_conf;
+               }
                disk_idx = rdev->raid_disk;
                if (disk_idx >= conf->raid_disks
                    || disk_idx < 0)
@@ -2271,7 +2296,7 @@ static int run(mddev_t *mddev)
                disk->head_position = 0;
        }
        /* need to check that every block has at least one working mirror */
-       if (!enough(conf)) {
+       if (!enough(conf, -1)) {
                printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
                       mdname(mddev));
                goto out_free_conf;